Skip to content

Commit

Permalink
fix bytes parsing from JSON string with UTF16 pairs
Browse files Browse the repository at this point in the history
  • Loading branch information
xiangjinwu committed Apr 19, 2024
1 parent f33bf6a commit e1c6366
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 25 deletions.
90 changes: 67 additions & 23 deletions lang/rust/avro/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -861,10 +861,14 @@ impl Value {
}
}

/// Converts `s` into bytes by keeping only the low-order byte of every
/// UTF-16 code unit.
///
/// Each code unit produced by [`str::encode_utf16`] contributes exactly one
/// output byte; the high-order byte is discarded. Consequently:
///
/// * code points `U+0000..=U+00FF` map to the byte of the same value;
/// * code units above `0xFF` are truncated (e.g. `U+0100` becomes `0x00`);
/// * a character outside the BMP is encoded as a surrogate pair and thus
///   yields two bytes (e.g. `U+1D11E` becomes `[0x34, 0x1E]`).
fn to_constrained_utf16(s: &str) -> Vec<u8> {
    s.encode_utf16()
        .map(|unit| {
            // Little-endian layout puts the low-order byte first.
            let [low, _high] = unit.to_le_bytes();
            low
        })
        .collect()
}

fn resolve_bytes(self) -> Result<Self, Error> {
match self {
Value::Bytes(bytes) => Ok(Value::Bytes(bytes)),
Value::String(s) => Ok(Value::Bytes(s.into_bytes())),
Value::String(s) => Ok(Value::Bytes(Self::to_constrained_utf16(&s))),
Value::Array(items) => Ok(Value::Bytes(
items
.into_iter()
Expand All @@ -878,9 +882,10 @@ impl Value {
fn resolve_string(self) -> Result<Self, Error> {
match self {
Value::String(s) => Ok(Value::String(s)),
Value::Bytes(bytes) | Value::Fixed(_, bytes) => Ok(Value::String(
String::from_utf8(bytes).map_err(Error::ConvertToUtf8)?,
)),
// Be conservative for now. It is NOT UTF8.
// Value::Bytes(bytes) | Value::Fixed(_, bytes) => Ok(Value::String(
// String::from_utf8(bytes).map_err(Error::ConvertToUtf8)?,
// )),
other => Err(Error::GetString(other.into())),
}
}
Expand All @@ -894,7 +899,10 @@ impl Value {
Err(Error::CompareFixedSizes { size, n })
}
}
Value::String(s) => Ok(Value::Fixed(s.len(), s.into_bytes())),
Value::String(s) => {
let bytes = Self::to_constrained_utf16(&s);
Ok(Value::Fixed(bytes.len(), bytes))
}
other => Err(Error::GetStringForFixed(other.into())),
}
}
Expand Down Expand Up @@ -1582,27 +1590,27 @@ Field with name '"b"' is not a member of the map items"#,
Ok(())
}

#[test]
fn resolve_string_from_bytes() -> TestResult {
let value = Value::Bytes(vec![97, 98, 99]);
assert_eq!(
value.resolve(&Schema::String)?,
Value::String("abc".to_string())
);
// #[test]
// fn resolve_string_from_bytes() -> TestResult {
// let value = Value::Bytes(vec![97, 98, 99]);
// assert_eq!(
// value.resolve(&Schema::String)?,
// Value::String("abc".to_string())
// );

Ok(())
}
// Ok(())
// }

#[test]
fn resolve_string_from_fixed() -> TestResult {
let value = Value::Fixed(3, vec![97, 98, 99]);
assert_eq!(
value.resolve(&Schema::String)?,
Value::String("abc".to_string())
);
// #[test]
// fn resolve_string_from_fixed() -> TestResult {
// let value = Value::Fixed(3, vec![97, 98, 99]);
// assert_eq!(
// value.resolve(&Schema::String)?,
// Value::String("abc".to_string())
// );

Ok(())
}
// Ok(())
// }

#[test]
fn resolve_bytes_failure() {
Expand Down Expand Up @@ -2926,4 +2934,40 @@ Field with name '"b"' is not a member of the map items"#,

Ok(())
}

/// Checks how a JSON string `default` on a `bytes` field is turned into bytes.
///
/// A record is decoded from empty input using a field-less schema and then
/// resolved against a reader schema whose single `bytes` field `f0` carries the
/// default under test (presumably the field value is taken from that default —
/// confirm against `from_avro_datum`). The resulting bytes must equal the
/// low-order byte of each UTF-16 code unit of the default string.
#[test]
fn test_bytes_default() -> TestResult {
    let old_schema = Schema::parse_str(r#"{"type":"record","name":"Root","fields":[]}"#)?;

    // (JSON `default` attribute, bytes expected after resolution)
    let cases: [(&str, &[u8]); 2] = [
        (r#" "default": "\u00FF" "#, &[0xffu8]),
        // Surrogate pair: one byte per UTF-16 code unit.
        (r#" "default": "\uD834\uDD1E" "#, &[0x34, 0x1e]),
    ];

    for (input, expected) in cases {
        let reader_json = format!(
            r#"{{
"type": "record",
"name": "Root",
"fields": [
{{
"name": "f0",
"type": "bytes",
{input}
}}
]
}}"#
        );
        let root = Schema::parse_str(&reader_json)?;
        let datum = crate::from_avro_datum(&old_schema, &mut std::io::empty(), Some(&root))?;

        // Only a record whose first field is `Bytes` is a valid outcome here.
        let actual = match &datum {
            Value::Record(fields) => {
                if let Value::Bytes(b) = &fields[0].1 {
                    b.as_slice()
                } else {
                    unreachable!()
                }
            }
            _ => unreachable!(),
        };
        assert_eq!(expected, actual);
    }

    Ok(())
}
}
4 changes: 2 additions & 2 deletions lang/rust/avro/tests/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,13 @@ lazy_static! {
(r#""boolean""#, "true", Value::Boolean(true)),
(r#""string""#, r#""foo""#, Value::String("foo".to_string())),
(r#""bytes""#, r#""a""#, Value::Bytes(vec![97])), // ASCII 'a' => one byte
(r#""bytes""#, r#""\u00FF""#, Value::Bytes(vec![195, 191])), // The value is between U+0080 and U+07FF => two bytes
(r#""bytes""#, r#""\u00FF""#, Value::Bytes(vec![255])), // The value is between U+0080 and U+07FF => UTF-16
(r#""int""#, "5", Value::Int(5)),
(r#""long""#, "5", Value::Long(5)),
(r#""float""#, "1.1", Value::Float(1.1)),
(r#""double""#, "1.1", Value::Double(1.1)),
(r#"{"type": "fixed", "name": "F", "size": 2}"#, r#""a""#, Value::Fixed(1, vec![97])), // ASCII 'a' => one byte
(r#"{"type": "fixed", "name": "F", "size": 2}"#, r#""\u00FF""#, Value::Fixed(2, vec![195, 191])), // The value is between U+0080 and U+07FF => two bytes
(r#"{"type": "fixed", "name": "F", "size": 2}"#, r#""\u00FF""#, Value::Fixed(1, vec![255])), // The value is between U+0080 and U+07FF => UTF-16
(r#"{"type": "enum", "name": "F", "symbols": ["FOO", "BAR"]}"#, r#""FOO""#, Value::Enum(0, "FOO".to_string())),
(r#"{"type": "array", "items": "int"}"#, "[1, 2, 3]", Value::Array(vec![Value::Int(1), Value::Int(2), Value::Int(3)])),
(r#"{"type": "map", "values": "int"}"#, r#"{"a": 1, "b": 2}"#, Value::Map([("a".to_string(), Value::Int(1)), ("b".to_string(), Value::Int(2))].iter().cloned().collect())),
Expand Down

0 comments on commit e1c6366

Please sign in to comment.