From 273b02fd6cdc8b8f176f78dccc74ae9fc3841fb7 Mon Sep 17 00:00:00 2001 From: Stefan Kandic Date: Tue, 5 Nov 2024 14:03:41 +0100 Subject: [PATCH] [SPARK-50214][SQL] From json/xml should not change collations in the given schema ### What changes were proposed in this pull request? This fix ensures that `from_json` and `from_xml` return the exact schema provided, even when session collation is set. ### Why are the changes needed? When serializing schema with the `sql` method, parsing it back can yield a different schema if session collation is set. This fix maintains consistency in schema structure regardless of collation settings. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #48750 from stefankandic/fixParseSchema. Lead-authored-by: Stefan Kandic Co-authored-by: Hyukjin Kwon Signed-off-by: Max Gekk --- .../org/apache/spark/sql/functions.scala | 4 +- .../queries/function_from_json.json | 2 +- .../queries/function_from_json.proto.bin | Bin 221 -> 394 bytes .../queries/function_from_xml.json | 2 +- .../queries/function_from_xml.proto.bin | Bin 220 -> 393 bytes .../CollationSQLFunctionsSuite.scala | 60 ++++++++++++++++++ 6 files changed, 64 insertions(+), 4 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala index d7b61468b43d7..8c49952bc31e3 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala @@ -6809,7 +6809,7 @@ object functions { */ // scalastyle:on line.size.limit def from_json(e: Column, schema: DataType, options: Map[String, String]): Column = { - from_json(e, lit(schema.sql), options.iterator) + from_json(e, lit(schema.json), 
options.iterator) } // scalastyle:off line.size.limit @@ -7645,7 +7645,7 @@ object functions { */ // scalastyle:on line.size.limit def from_xml(e: Column, schema: StructType, options: java.util.Map[String, String]): Column = - from_xml(e, lit(schema.sql), options.asScala.iterator) + from_xml(e, lit(schema.json), options.asScala.iterator) // scalastyle:off line.size.limit /** diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json index 5af297b17f8b8..ddfa91abca05e 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json @@ -20,7 +20,7 @@ } }, { "literal": { - "string": "STRUCT\u003cid: BIGINT, a: INT, b: DOUBLE\u003e" + "string": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}" } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin index 1752a847d272b9f33c9d54822b655dbbc968285e..ad95d0f2b343d1dc58a66f45ae9856689ae62881 100644 GIT binary patch literal 394 zcma*jv1-FG5C-60LWgP!9zEWCPjDDNNJNElH54d8R)pH<)V}YvTWHbF-_5c6? 
diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json index 3b229f6bc762d..cfcd40a74b3a7 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json @@ -20,7 +20,7 @@ } }, { "literal": { - "string": "STRUCT\u003cid: BIGINT, a: INT, b: DOUBLE\u003e" + "string": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}" } }] } diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin index 60c1bd68fe336dd918c4b288ffdea60efbe25c5e..1cc3a26c254fb7a7cad9088be96f971259761e97 100644 GIT binary patch literal 393 zcma*j!Ait15C&jp7eR7bdhBt|^8`cbgRF>5O&TIex@1#uDZO|Sy!l>&;6n&Lg()lU zv0ehn{NKPIwb?=ayj5?YHoL3+D^xFV%xQecjnmvY>tZJ1C+JIpMtNhtE~cCeS-Rc$9I@8pAH5Y&pVN@ zcgE`@^6i9C!ZLEYJpac05N)}|(R(R9kV{WP7)%x|S&*lBravhcSXvls(*JFDvFATq Hqj&QUg_nZ| delta 79 zcmeBVzQf4Q#U#Keb(L`<`!styDH|@1w4(gn_=?;dAyy$~F2-~rO)m8;)!>kzQ0EYv f%oHmHCr@`zzYraTL@Na_on)op;veed<7x*0{|*y| diff --git a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala new file mode 100644 index 0000000000000..83ec8c8d1bafb --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.collation + +import org.apache.spark.sql.{Column, Dataset, QueryTest} +import org.apache.spark.sql.functions.{from_json, from_xml} +import org.apache.spark.sql.internal.SqlApiConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ + +class CollationSQLFunctionsSuite extends QueryTest with SharedSparkSession { + + test("SPARK-50214: from_json and from_xml work correctly with session collation") { + import testImplicits._ + + def checkSchema( + dataset: Dataset[String], + transformation: Column, + expectedSchema: StructType): Unit = { + val transformedSchema = dataset.select(transformation.as("result")).schema + assert(transformedSchema === expectedSchema) + } + + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE_CI_AI") { + Seq( + StringType, + StringType("UTF8_BINARY"), + StringType("UNICODE"), + StringType("UNICODE_CI_AI")).foreach { stringType => + val dataSchema = StructType(Seq(StructField("fieldName", stringType))) + val expectedSchema = StructType(Seq(StructField("result", dataSchema))) + + // JSON Test + val jsonData = Seq("""{"fieldName": "fieldValue"}""") + val jsonDataset = spark.createDataset(jsonData) + checkSchema(jsonDataset, from_json($"value", dataSchema), expectedSchema) + + // XML Test + 
val xmlData = Seq("<root><fieldName>fieldValue</fieldName></root>") + val xmlDataset = spark.createDataset(xmlData) + checkSchema(xmlDataset, from_xml($"value", dataSchema), expectedSchema) + } + } + } +}