[SPARK-50214][SQL] From json/xml should not change collations in the given schema

### What changes were proposed in this pull request?
This fix ensures that `from_json` and `from_xml` return the exact schema provided, even when session collation is set.

### Why are the changes needed?
When serializing schema with the `sql` method, parsing it back can yield a different schema if session collation is set. This fix maintains consistency in schema structure regardless of collation settings.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
New unit tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #48750 from stefankandic/fixParseSchema.

Lead-authored-by: Stefan Kandic <stefan.kandic@databricks.com>
Co-authored-by: Hyukjin Kwon <gurwls223@gmail.com>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
apache · Nov 5, 2024 · 273b02f · 273b02f
1 parent 642a62b
commit 273b02f
Show file tree

Hide file tree

Showing 6 changed files with 64 additions and 4 deletions.
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
@@ -6809,7 +6809,7 @@ object functions {
    */
   // scalastyle:on line.size.limit
   def from_json(e: Column, schema: DataType, options: Map[String, String]): Column = {
-    from_json(e, lit(schema.sql), options.iterator)
+    from_json(e, lit(schema.json), options.iterator)
   }
 
   // scalastyle:off line.size.limit
@@ -7645,7 +7645,7 @@ object functions {
    */
   // scalastyle:on line.size.limit
   def from_xml(e: Column, schema: StructType, options: java.util.Map[String, String]): Column =
-    from_xml(e, lit(schema.sql), options.asScala.iterator)
+    from_xml(e, lit(schema.json), options.asScala.iterator)
 
   // scalastyle:off line.size.limit
   /**

diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json
@@ -20,7 +20,7 @@
           }
         }, {
           "literal": {
-            "string": "STRUCT\u003cid: BIGINT, a: INT, b: DOUBLE\u003e"
+            "string": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}"
           }
         }]
       }

diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin
diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json
@@ -20,7 +20,7 @@
           }
         }, {
           "literal": {
-            "string": "STRUCT\u003cid: BIGINT, a: INT, b: DOUBLE\u003e"
+            "string": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}"
           }
         }]
       }

diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.collation
+
+import org.apache.spark.sql.{Column, Dataset, QueryTest}
+import org.apache.spark.sql.functions.{from_json, from_xml}
+import org.apache.spark.sql.internal.SqlApiConf
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.types._
+
+class CollationSQLFunctionsSuite extends QueryTest with SharedSparkSession {
+
+  test("SPARK-50214: from_json and from_xml work correctly with session collation") {
+    import testImplicits._
+
+    def checkSchema(
+        dataset: Dataset[String],
+        transformation: Column,
+        expectedSchema: StructType): Unit = {
+      val transformedSchema = dataset.select(transformation.as("result")).schema
+      assert(transformedSchema === expectedSchema)
+    }
+
+    withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE_CI_AI") {
+      Seq(
+        StringType,
+        StringType("UTF8_BINARY"),
+        StringType("UNICODE"),
+        StringType("UNICODE_CI_AI")).foreach { stringType =>
+        val dataSchema = StructType(Seq(StructField("fieldName", stringType)))
+        val expectedSchema = StructType(Seq(StructField("result", dataSchema)))
+
+        // JSON Test
+        val jsonData = Seq("""{"fieldName": "fieldValue"}""")
+        val jsonDataset = spark.createDataset(jsonData)
+        checkSchema(jsonDataset, from_json($"value", dataSchema), expectedSchema)
+
+        // XML Test
+        val xmlData = Seq("<root><fieldName>fieldValue</fieldName></root>")
+        val xmlDataset = spark.createDataset(xmlData)
+        checkSchema(xmlDataset, from_xml($"value", dataSchema), expectedSchema)
+      }
+    }
+  }
+}