Skip to content

Commit

Permalink
feat(chars): support unicode and octal char literals
Browse files Browse the repository at this point in the history
  • Loading branch information
Samy-33 committed Oct 15, 2024
1 parent a45f306 commit e661508
Show file tree
Hide file tree
Showing 4 changed files with 211 additions and 26 deletions.
62 changes: 53 additions & 9 deletions compiler+runtime/include/cpp/jank/read/parse.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#pragma once

#include <codecvt>

#include <jank/result.hpp>
#include <jank/option.hpp>
#include <jank/read/lex.hpp>
Expand All @@ -12,33 +14,75 @@ namespace jank::runtime
/* TODO: Rename file to processor. */
namespace jank::read::parse
{
static option<char> get_char_from_literal(native_persistent_string const &sv)
/* Parses the digits of a prefixed character literal (e.g. `\u03bb` with base 16 or
 * `\o056` with base 8) and returns the UTF-8 encoding of the resulting code point.
 *
 * The first two characters of `char_literal` (the backslash and the base marker) are
 * skipped; every character after them must be a digit which is valid in `base`.
 *
 * Returns none when:
 *   - `base` is outside of [2, 36], the range of explicit bases `std::stol` supports
 *   - there are no digits after the two character prefix
 *   - any character after the prefix is not a digit of `base`
 *   - the value is larger than the highest Unicode code point (0x10FFFF)
 *   - the value cannot be converted to UTF-8 */
static option<native_persistent_string>
parse_character_in_base(native_persistent_string_view const &char_literal, int const base)
{
  /* Length of the `\u` / `\o` prefix which precedes the digits. */
  constexpr size_t prefix_size{ 2 };
  constexpr long max_codepoint{ 0x10FFFF };

  /* Guarding the size here also keeps `size() - prefix_size` from underflowing. */
  if(base < 2 || 36 < base || char_literal.size() <= prefix_size)
  {
    return none;
  }

  auto const digits(char_literal.data() + prefix_size);
  auto const digit_count(char_literal.size() - prefix_size);

  /* `std::stol` skips leading whitespace and accepts a sign as well as, in base 16, a
   * `0x`/`0X` prefix. Left unchecked, literals such as `\u-123`, `\o+12`, or `\u0x12`
   * would be consumed in full and treated as valid. We require every character to be
   * a plain digit of `base` before handing the text to `std::stol`.
   *
   * Refer: https://en.cppreference.com/w/cpp/string/basic_string/stol */
  for(size_t i{}; i < digit_count; ++i)
  {
    auto const c(digits[i]);
    int digit_value{ -1 };
    if('0' <= c && c <= '9')
    {
      digit_value = c - '0';
    }
    else if('a' <= c && c <= 'z')
    {
      digit_value = c - 'a' + 10;
    }
    else if('A' <= c && c <= 'Z')
    {
      digit_value = c - 'A' + 10;
    }

    if(digit_value < 0 || base <= digit_value)
    {
      return none;
    }
  }

  try
  {
    size_t chars_processed{};
    auto const codepoint(
      std::stol(native_persistent_string{ digits, digit_count }, &chars_processed, base));

    /* `std::stol` stops at the first character it cannot parse and reports how many
     * characters it consumed. For example, `12a` in octal base only consumes `12` and
     * reports `2`. The digit check above should already rule this out, but we keep the
     * check so a partially parsed literal is never accepted; none tells downstream
     * callers about the inability to parse the char literal. */
    if(chars_processed != digit_count)
    {
      return none;
    }

    /* Without this, a value wider than 32 bits would be silently truncated when
     * converted to `char32_t` and could end up as an unrelated, valid code point. */
    if(codepoint < 0 || max_codepoint < codepoint)
    {
      return none;
    }

    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
    return native_persistent_string{ converter.to_bytes(static_cast<char32_t>(codepoint)) };
  }
  catch(std::range_error const &)
  {
    /* Thrown by `to_bytes` when the code point can't be encoded. */
    return none;
  }
  catch(std::invalid_argument const &)
  {
    /* Thrown by `std::stol` when no conversion could be performed. */
    return none;
  }
  catch(std::out_of_range const &)
  {
    /* Thrown by `std::stol` when the value doesn't fit into a `long`. */
    return none;
  }
}

static option<char> get_char_from_literal(native_persistent_string const &s)
{
if(sv.size() == 2)
if(s.size() == 2)
{
return sv[1];
return s[1];
}
else if(sv == R"(\newline)")
else if(s == R"(\newline)")
{
return '\n';
}
else if(sv == R"(\space)")
else if(s == R"(\space)")
{
return ' ';
}
else if(sv == R"(\tab)")
else if(s == R"(\tab)")
{
return '\t';
}
else if(sv == R"(\backspace)")
else if(s == R"(\backspace)")
{
return '\b';
}
else if(sv == R"(\formfeed)")
else if(s == R"(\formfeed)")
{
return '\f';
}
else if(sv == R"(\return)")
else if(s == R"(\return)")
{
return '\r';
}
Expand Down
35 changes: 35 additions & 0 deletions compiler+runtime/src/cpp/jank/read/parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,41 @@ namespace jank::read::parse

if(character.is_none())
{
/* Unicode */
if(sv[0] == '\\' && sv[1] == 'u')
{
/* Should always be of length 6 eg. \uaBc5 */
if(sv.size() != 6)
{
return err(error{ token.pos, fmt::format("invalid unicode character literal `{}`", sv) });
}

auto const char_bytes(
parse_character_in_base(native_persistent_string{ sv.data(), sv.size() }, 16));

if(char_bytes.is_some())
{
return object_source_info{ make_box<obj::character>(char_bytes.unwrap()), token, token };
}
}
/* Octal */
else if(sv[0] == '\\' && sv[1] == 'o')
{
/* Should always be of length 5 eg. \o056 */
if(sv.size() != 5)
{
return err(error{ token.pos, fmt::format("invalid octal character literal `{}`", sv) });
}

auto const char_bytes(
parse_character_in_base(native_persistent_string{ sv.data(), sv.size() }, 8));

if(char_bytes.is_some())
{
return object_source_info{ make_box<obj::character>(char_bytes.unwrap()), token, token };
}
}

return err(error{ token.pos, fmt::format("invalid character literal `{}`", sv) });
}

Expand Down
41 changes: 24 additions & 17 deletions compiler+runtime/src/cpp/jank/runtime/obj/character.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,31 @@

namespace jank::runtime
{
static native_persistent_string get_literal_from_char(char const ch)
static native_persistent_string get_literal_from_char_bytes(native_persistent_string const &bytes)
{
switch(ch)
if(bytes.size() == 1)
{
case '\n':
return R"(\newline)";
case ' ':
return R"(\space)";
case '\t':
return R"(\tab)";
case '\b':
return R"(\backspace)";
case '\f':
return R"(\formfeed)";
case '\r':
return R"(\return)";
default:
return fmt::format(R"(\{})", ch);
switch(bytes[0])
{
case '\n':
return R"(\newline)";
case ' ':
return R"(\space)";
case '\t':
return R"(\tab)";
case '\b':
return R"(\backspace)";
case '\f':
return R"(\formfeed)";
case '\r':
return R"(\return)";
default:
return fmt::format(R"(\{})", bytes[0]);
}
}
else
{
return fmt::format(R"(\{})", bytes);
}
}

Expand Down Expand Up @@ -57,7 +64,7 @@ namespace jank::runtime

native_persistent_string obj::character::to_code_string() const
{
return get_literal_from_char(data[0]);
return get_literal_from_char_bytes(data);
}

native_hash obj::character::to_hash() const
Expand Down
99 changes: 99 additions & 0 deletions compiler+runtime/test/cpp/jank/read/parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,105 @@ namespace jank::read::parse
CHECK(r.expect_ok().unwrap().start
== lex::token{ 9, 10, lex::token_kind::character, "\\backspace" });
}

SUBCASE("Unicode")
{
SUBCASE("Valid")
{
lex::processor lp{ R"(\u1234 \u5678 \u90ab \ucdef \uABCD \uEFa0)" };
processor p{ lp.begin(), lp.end() };

size_t offset{};
for(native_persistent_string const &ch :
{ "\\u1234", "\\u5678", "\\u90ab", "\\ucdef", "\\uABCD", "\\uEFa0" })
{
auto const r(p.next());
CHECK(equal(r.expect_ok().unwrap().ptr,
make_box<obj::character>(parse_character_in_base(ch, 16).unwrap())));

auto const len(ch.size());
CHECK(r.expect_ok().unwrap().start
== lex::token{ offset, len, lex::token_kind::character, ch });
CHECK(r.expect_ok().unwrap().end == r.expect_ok().unwrap().start);

/* +1 for space */
offset += len + 1;
}
}

SUBCASE("Invalid length")
{
lex::processor lp{ R"(\u123 \ucd \u1 \u12345)" };
processor p{ lp.begin(), lp.end() };

for(size_t i{}; i < 4; ++i)
{
auto const r(p.next());
CHECK(r.is_err());
}
}

SUBCASE("Invalid unicode characters")
{
lex::processor lp{ R"(\uabcg \u120x \uza19 \u1Gab)" };
processor p{ lp.begin(), lp.end() };

for(size_t i{}; i < 4; ++i)
{
auto const r(p.next());
CHECK(r.is_err());
}
}
}

SUBCASE("Octal")
{
SUBCASE("Valid")
{
lex::processor lp{ R"(\o012 \o345 \o670)" };
processor p{ lp.begin(), lp.end() };

size_t offset{};
for(native_persistent_string const &ch : { "\\o012", "\\o345", "\\o670" })
{
auto const r(p.next());
CHECK(equal(r.expect_ok().unwrap().ptr,
make_box<obj::character>(parse_character_in_base(ch, 8).unwrap())));

auto const len(ch.size());
CHECK(r.expect_ok().unwrap().start
== lex::token{ offset, len, lex::token_kind::character, ch });
CHECK(r.expect_ok().unwrap().end == r.expect_ok().unwrap().start);

/* +1 for space */
offset += len + 1;
}
}

SUBCASE("Invalid length")
{
lex::processor lp{ R"(\o1 \o23 \o4567 \o0123454)" };
processor p{ lp.begin(), lp.end() };

for(size_t i{}; i < 4; ++i)
{
auto const r(p.next());
CHECK(r.is_err());
}
}

SUBCASE("Invalid octal character")
{
  /* Every literal has the expected length of five characters, but each one contains
   * at least one character which is not an octal digit (`8`, `9`, letters), so all
   * four tokens are expected to produce an error. */
  lex::processor lp{ R"(\o128 \o962 \oAaa \oxf0)" };
  processor p{ lp.begin(), lp.end() };

  for(size_t i{}; i < 4; ++i)
  {
    auto const r(p.next());
    CHECK(r.is_err());
  }
}
}
}

TEST_CASE("String")
Expand Down

0 comments on commit e661508

Please sign in to comment.