From cc9e18523da9a7f2ed61eeb5ab86f68d939870ce Mon Sep 17 00:00:00 2001 From: Raiki Tamura Date: Wed, 28 Jun 2023 19:14:50 +0900 Subject: [PATCH] gccrs: fix tokenizing utf-8 whitespaces gcc/rust/ChangeLog: * lex/rust-lex.cc (Lexer::build_token): Add check for all kinds of whitespace. gcc/testsuite/ChangeLog: * rust/compile/torture/utf8_whitespaces.rs: New test. Signed-off-by: Raiki Tamura --- gcc/rust/lex/rust-lex.cc | 13 +++++++++++-- .../rust/compile/torture/utf8_whitespaces.rs | 16 ++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/rust/compile/torture/utf8_whitespaces.rs diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc index 43acdf070210..28f38638fb51 100644 --- a/gcc/rust/lex/rust-lex.cc +++ b/gcc/rust/lex/rust-lex.cc @@ -420,7 +420,10 @@ Lexer::build_token () { /* ignore whitespace characters for tokens but continue updating * location */ - case '\n': // newline + case '\n': // newline + case 0x0085: // next line + case 0x2028: // line separator + case 0x2029: // paragraph separator current_line++; current_column = 1; // tell line_table that new line starts @@ -432,10 +435,16 @@ Lexer::build_token () case ' ': // space current_column++; continue; - case '\t': // tab + case '\t': // horizontal tab // width of a tab is not well-defined, assume 8 spaces current_column += 8; continue; + case '\v': // vertical tab + case 0x000c: // form feed + case 0x200e: // left-to-right mark + case 0x200f: // right-to-left mark + // Ignored. 
+ continue; // punctuation - actual tokens case '=': diff --git a/gcc/testsuite/rust/compile/torture/utf8_whitespaces.rs b/gcc/testsuite/rust/compile/torture/utf8_whitespaces.rs new file mode 100644 index 000000000000..b45c014812fb --- /dev/null +++ b/gcc/testsuite/rust/compile/torture/utf8_whitespaces.rs @@ -0,0 +1,16 @@ +fn main() { + // FORM FEED + + // LINE TABULATION (vt) + + // NEXT LINE (nel) + … + // LEFT-TO-RIGHT MARK + ‎ + // RIGHT-TO-LEFT MARK + ‏ + // LINE SEPARATOR + 
 + // PARAGRAPH SEPARATOR + 
 +}