Skip to content

Commit

Permalink
[SPARK-50214][SQL] From json/xml should not change collations in the …
Browse files Browse the repository at this point in the history
…given schema

### What changes were proposed in this pull request?
This fix ensures that `from_json` and `from_xml` return the exact schema provided, even when session collation is set.

### Why are the changes needed?
When a schema is serialized with the `sql` method and then parsed back, the result can differ from the original schema if a session collation is set. Serializing the schema with `json` instead keeps the schema structure consistent regardless of collation settings.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
New unit tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #48750 from stefankandic/fixParseSchema.

Lead-authored-by: Stefan Kandic <stefan.kandic@databricks.com>
Co-authored-by: Hyukjin Kwon <gurwls223@gmail.com>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
  • Loading branch information
2 people authored and MaxGekk committed Nov 5, 2024
1 parent 642a62b commit 273b02f
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 4 deletions.
4 changes: 2 additions & 2 deletions sql/api/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6809,7 +6809,7 @@ object functions {
*/
// scalastyle:on line.size.limit
def from_json(e: Column, schema: DataType, options: Map[String, String]): Column = {
// Diff view (markers stripped): the next line is the pre-change call, which handed the
// schema to the string-based overload as its SQL text (`schema.sql`).
from_json(e, lit(schema.sql), options.iterator)
// Diff view: replacement call. The schema is now passed as its JSON representation
// (`schema.json`); per the commit message, the SQL text could parse back to a different
// schema when a session collation is set.
from_json(e, lit(schema.json), options.iterator)
}

// scalastyle:off line.size.limit
Expand Down Expand Up @@ -7645,7 +7645,7 @@ object functions {
*/
// scalastyle:on line.size.limit
def from_xml(e: Column, schema: StructType, options: java.util.Map[String, String]): Column =
// Diff view (markers stripped): the next line is the pre-change call, which passed the
// schema as its SQL text (`schema.sql`).
from_xml(e, lit(schema.sql), options.asScala.iterator)
// Diff view: replacement call. The schema is now passed as its JSON representation
// (`schema.json`); per the commit message, the SQL text could parse back to a different
// schema when a session collation is set.
from_xml(e, lit(schema.json), options.asScala.iterator)

// scalastyle:off line.size.limit
/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
}
}, {
"literal": {
"string": "STRUCT\u003cid: BIGINT, a: INT, b: DOUBLE\u003e"
"string": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}"
}
}]
}
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
}
}, {
"literal": {
"string": "STRUCT\u003cid: BIGINT, a: INT, b: DOUBLE\u003e"
"string": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}"
}
}]
}
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.collation

import org.apache.spark.sql.{Column, Dataset, QueryTest}
import org.apache.spark.sql.functions.{from_json, from_xml}
import org.apache.spark.sql.internal.SqlApiConf
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types._

/**
 * Verifies that `from_json` and `from_xml`, when given a schema object, produce a result
 * column whose schema equals the supplied one while a default session collation is set.
 */
class CollationSQLFunctionsSuite extends QueryTest with SharedSparkSession {

  test("SPARK-50214: from_json and from_xml work correctly with session collation") {
    import testImplicits._

    // Projects `parsed` under the alias "result" and compares the projected schema
    // against `expected`.
    def assertResultSchema(
        input: Dataset[String],
        parsed: Column,
        expected: StructType): Unit = {
      val actual = input.select(parsed.as("result")).schema
      assert(actual === expected)
    }

    withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE_CI_AI") {
      // The un-parameterized string type plus three explicitly collated ones, one of which
      // names the same collation as the session default configured above.
      val stringTypes = Seq(
        StringType,
        StringType("UTF8_BINARY"),
        StringType("UNICODE"),
        StringType("UNICODE_CI_AI"))

      for (stringType <- stringTypes) {
        val fieldSchema = StructType(StructField("fieldName", stringType) :: Nil)
        val wrappedSchema = StructType(StructField("result", fieldSchema) :: Nil)

        // JSON: parse a single-field document with the schema under test.
        val jsonRows = spark.createDataset(Seq("""{"fieldName": "fieldValue"}"""))
        assertResultSchema(jsonRows, from_json($"value", fieldSchema), wrappedSchema)

        // XML: same check through the XML parsing function.
        val xmlRows = spark.createDataset(Seq("<root><fieldName>fieldValue</fieldName></root>"))
        assertResultSchema(xmlRows, from_xml($"value", fieldSchema), wrappedSchema)
      }
    }
  }
}

0 comments on commit 273b02f

Please sign in to comment.