From 8c7623268bcf4006f8792b2c86122d239e348589 Mon Sep 17 00:00:00 2001
From: Ekaitz Zarraga
Date: Tue, 13 Aug 2024 00:36:55 +0200
Subject: Fix unicode errors

Move whitespace-chars, newline-chars and punctuation-chars out of
cook/parse.scm into a new (cook unicode) library that is imported by
cook/parse.sld and cook/parse-internals.sld.

punctuation-chars is now built from the Unicode punctuation general
categories (Pc, Pd, Pe, Pf, Pi, Po, Ps) instead of the three characters
`.', `{' and `}'.  component-chars subtracts the whole punctuation set
rather than only `{' and `}'.

Correct the expected text of two parser tests so that it matches the
input each test actually parses.

---
Notes (not part of the commit message):
 - The "Po" list in cook/unicode.sld contained four runs of code points
   twice (the runs starting at U+2D70, U+A9DF, U+11049 and U+11A9E).
   Each code point is now listed once; the resulting char-set is the same.
 - cook/unicode.sld was re-wrapped and commented, so its hunk size and the
   diffstat were recomputed, and the blob id line for that file was dropped
   because it no longer describes the content.
 - Leading whitespace of context lines was reconstructed.  If a hunk is
   rejected, retry with `git apply --ignore-whitespace`.

 cook/parse-internals.sld |   3 +-
 cook/parse.scm           |   8 +-
 cook/parse.sld           |   3 +-
 cook/unicode.sld         | 160 +++++++++++++++++++++++++++++++++++++++++++++++
 tests/parse.scm          |   4 +-
 5 files changed, 167 insertions(+), 11 deletions(-)
 create mode 100644 cook/unicode.sld

diff --git a/cook/parse-internals.sld b/cook/parse-internals.sld
index 1728f1d..d73a3c3 100644
--- a/cook/parse-internals.sld
+++ b/cook/parse-internals.sld
@@ -8,7 +8,8 @@
           (chibi char-set)
           (chibi parse)
           (chibi regexp)
-          (chibi string))
+          (chibi string)
+          (cook unicode))
   (export amount?
           amount-quantity
           amount-unit
diff --git a/cook/parse.scm b/cook/parse.scm
index 0fc4744..fd6c70c 100644
--- a/cook/parse.scm
+++ b/cook/parse.scm
@@ -85,12 +85,6 @@ https://github.com/cooklang/spec/blob/main/EBNF.md
        lis))
 
 
-(define whitespace-chars (char-set-union
-  (char-set #\space #\x00A0 #\x1680)
-  (char-set #\x202F #\x205F #\x3000)
-  (ucs-range->char-set #x2000 #x200B)))
-(define newline-chars (char-set #\x000A #\x000D #\x0085 #\x2028 #\x2029))
-(define punctuation-chars (char-set #\. #\{ #\})) ;; TODO: do it right
 (define word-chars (char-set-difference char-set:full
                                         punctuation-chars
                                         newline-chars
@@ -101,7 +95,7 @@ https://github.com/cooklang/spec/blob/main/EBNF.md
                                         (char-set #\@ #\# #\~)))
 (define unit-chars (char-set-difference text-chars (char-set #\})))
 (define component-chars (char-set-difference text-chars
-                                             (char-set #\{ #\})))
+                                             punctuation-chars))
 (define component-word-chars (char-set-difference component-chars
                                                   newline-chars
                                                   whitespace-chars))
diff --git a/cook/parse.sld b/cook/parse.sld
index 1dcfb35..1af9734 100644
--- a/cook/parse.sld
+++ b/cook/parse.sld
@@ -8,7 +8,8 @@
           (chibi char-set)
           (chibi parse)
           (chibi regexp)
-          (chibi string))
+          (chibi string)
+          (cook unicode))
   (export amount?
           amount-quantity
           amount-unit
diff --git a/cook/unicode.sld b/cook/unicode.sld
new file mode 100644
--- /dev/null
+++ b/cook/unicode.sld
@@ -0,0 +1,160 @@
+;;; Character sets used by the parser: whitespace, newlines and punctuation.
+;;; Imported by cook/parse.sld and cook/parse-internals.sld.
+(define-library (cook unicode)
+  (import (chibi)
+          (srfi 14))
+  (export whitespace-chars
+          newline-chars
+          punctuation-chars)
+  (begin
+    ;; Space characters.  SRFI 14 specifies ucs-range->char-set as half-open,
+    ;; so the range contributes U+2000..U+200A (U+200B itself is not included).
+    (define whitespace-chars
+      (char-set-union
+        (char-set #\space #\x00A0 #\x1680)
+        (char-set #\x202F #\x205F #\x3000)
+        (ucs-range->char-set #x2000 #x200B)))
+
+    ;; Line terminators: LF, CR, NEL, LINE SEPARATOR, PARAGRAPH SEPARATOR.
+    (define newline-chars
+      (char-set #\x000A #\x000D #\x0085 #\x2028 #\x2029))
+
+    ;; Code points of the Unicode punctuation general categories (P*),
+    ;; one char-set per category.
+    (define punctuation-chars
+      (char-set-union
+        ;; Pc -- connector punctuation
+        (char-set #\x005F #\x203F #\x2040 #\x2054 #\xFE33 #\xFE34 #\xFE4D
+                  #\xFE4E #\xFE4F #\xFF3F)
+
+        ;; Pd -- dash punctuation
+        (char-set #\x002D #\x058A #\x05BE #\x1400 #\x1806 #\x2010 #\x2011
+                  #\x2012 #\x2013 #\x2014 #\x2015 #\x2E17 #\x2E1A #\x2E3A
+                  #\x2E3B #\x2E40 #\x301C #\x3030 #\x30A0 #\xFE31 #\xFE32
+                  #\xFE58 #\xFE63 #\xFF0D #\x10EAD)
+
+        ;; Pe -- close punctuation
+        (char-set #\x0029 #\x005D #\x007D #\x0F3B #\x0F3D #\x169C #\x2046
+                  #\x207E #\x208E #\x2309 #\x230B #\x232A #\x2769 #\x276B
+                  #\x276D #\x276F #\x2771 #\x2773 #\x2775 #\x27C6 #\x27E7
+                  #\x27E9 #\x27EB #\x27ED #\x27EF #\x2984 #\x2986 #\x2988
+                  #\x298A #\x298C #\x298E #\x2990 #\x2992 #\x2994 #\x2996
+                  #\x2998 #\x29D9 #\x29DB #\x29FD #\x2E23 #\x2E25 #\x2E27
+                  #\x2E29 #\x3009 #\x300B #\x300D #\x300F #\x3011 #\x3015
+                  #\x3017 #\x3019 #\x301B #\x301E #\x301F #\xFD3E #\xFE18
+                  #\xFE36 #\xFE38 #\xFE3A #\xFE3C #\xFE3E #\xFE40 #\xFE42
+                  #\xFE44 #\xFE48 #\xFE5A #\xFE5C #\xFE5E #\xFF09 #\xFF3D
+                  #\xFF5D #\xFF60 #\xFF63)
+
+        ;; Pf -- final quote punctuation
+        (char-set #\x00BB #\x2019 #\x201D #\x203A #\x2E03 #\x2E05 #\x2E0A
+                  #\x2E0D #\x2E1D #\x2E21)
+
+        ;; Pi -- initial quote punctuation
+        (char-set #\x00AB #\x2018 #\x201B #\x201C #\x201F #\x2039 #\x2E02
+                  #\x2E04 #\x2E09 #\x2E0C #\x2E1C #\x2E20)
+
+        ;; Po -- other punctuation
+        (char-set #\x0021 #\x0022 #\x0023 #\x0025 #\x0026 #\x0027 #\x002A
+                  #\x002C #\x002E #\x002F #\x003A #\x003B #\x003F #\x0040
+                  #\x005C #\x00A1 #\x00A7 #\x00B6 #\x00B7 #\x00BF #\x037E
+                  #\x0387 #\x055A #\x055B #\x055C #\x055D #\x055E #\x055F
+                  #\x0589 #\x05C0 #\x05C3 #\x05C6 #\x05F3 #\x05F4 #\x0609
+                  #\x060A #\x060C #\x060D #\x061B #\x061E #\x061F #\x066A
+                  #\x066B #\x066C #\x066D #\x06D4 #\x0700 #\x0701 #\x0702
+                  #\x0703 #\x0704 #\x0705 #\x0706 #\x0707 #\x0708 #\x0709
+                  #\x070A #\x070B #\x070C #\x070D #\x07F7 #\x07F8 #\x07F9
+                  #\x0830 #\x0831 #\x0832 #\x0833 #\x0834 #\x0835 #\x0836
+                  #\x0837 #\x0838 #\x0839 #\x083A #\x083B #\x083C #\x083D
+                  #\x083E #\x085E #\x0964 #\x0965 #\x0970 #\x09FD #\x0A76
+                  #\x0AF0 #\x0C77 #\x0C84 #\x0DF4 #\x0E4F #\x0E5A #\x0E5B
+                  #\x0F04 #\x0F05 #\x0F06 #\x0F07 #\x0F08 #\x0F09 #\x0F0A
+                  #\x0F0B #\x0F0C #\x0F0D #\x0F0E #\x0F0F #\x0F10 #\x0F11
+                  #\x0F12 #\x0F14 #\x0F85 #\x0FD0 #\x0FD1 #\x0FD2 #\x0FD3
+                  #\x0FD4 #\x0FD9 #\x0FDA #\x104A #\x104B #\x104C #\x104D
+                  #\x104E #\x104F #\x10FB #\x1360 #\x1361 #\x1362 #\x1363
+                  #\x1364 #\x1365 #\x1366 #\x1367 #\x1368 #\x166E #\x16EB
+                  #\x16EC #\x16ED #\x1735 #\x1736 #\x17D4 #\x17D5 #\x17D6
+                  #\x17D8 #\x17D9 #\x17DA #\x1800 #\x1801 #\x1802 #\x1803
+                  #\x1804 #\x1805 #\x1807 #\x1808 #\x1809 #\x180A #\x1944
+                  #\x1945 #\x1A1E #\x1A1F #\x1AA0 #\x1AA1 #\x1AA2 #\x1AA3
+                  #\x1AA4 #\x1AA5 #\x1AA6 #\x1AA8 #\x1AA9 #\x1AAA #\x1AAB
+                  #\x1AAC #\x1AAD #\x1B5A #\x1B5B #\x1B5C #\x1B5D #\x1B5E
+                  #\x1B5F #\x1B60 #\x1BFC #\x1BFD #\x1BFE #\x1BFF #\x1C3B
+                  #\x1C3C #\x1C3D #\x1C3E #\x1C3F #\x1C7E #\x1C7F #\x1CC0
+                  #\x1CC1 #\x1CC2 #\x1CC3 #\x1CC4 #\x1CC5 #\x1CC6 #\x1CC7
+                  #\x1CD3 #\x2016 #\x2017 #\x2020 #\x2021 #\x2022 #\x2023
+                  #\x2024 #\x2025 #\x2026 #\x2027 #\x2030 #\x2031 #\x2032
+                  #\x2033 #\x2034 #\x2035 #\x2036 #\x2037 #\x2038 #\x203B
+                  #\x203C #\x203D #\x203E #\x2041 #\x2042 #\x2043 #\x2047
+                  #\x2048 #\x2049 #\x204A #\x204B #\x204C #\x204D #\x204E
+                  #\x204F #\x2050 #\x2051 #\x2053 #\x2055 #\x2056 #\x2057
+                  #\x2058 #\x2059 #\x205A #\x205B #\x205C #\x205D #\x205E
+                  #\x2CF9 #\x2CFA #\x2CFB #\x2CFC #\x2CFE #\x2CFF #\x2D70
+                  #\x2E00 #\x2E01 #\x2E06 #\x2E07 #\x2E08 #\x2E0B #\x2E0E
+                  #\x2E0F #\x2E10 #\x2E11 #\x2E12 #\x2E13 #\x2E14 #\x2E15
+                  #\x2E16 #\x2E18 #\x2E19 #\x2E1B #\x2E1E #\x2E1F #\x2E2A
+                  #\x2E2B #\x2E2C #\x2E2D #\x2E2E #\x2E30 #\x2E31 #\x2E32
+                  #\x2E33 #\x2E34 #\x2E35 #\x2E36 #\x2E37 #\x2E38 #\x2E39
+                  #\x2E3C #\x2E3D #\x2E3E #\x2E3F #\x2E41 #\x2E43 #\x2E44
+                  #\x2E45 #\x2E46 #\x2E47 #\x2E48 #\x2E49 #\x2E4A #\x2E4B
+                  #\x2E4C #\x2E4D #\x2E4E #\x2E4F #\x2E52 #\x3001 #\x3002
+                  #\x3003 #\x303D #\x30FB #\xA4FE #\xA4FF #\xA60D #\xA60E
+                  #\xA60F #\xA673 #\xA67E #\xA6F2 #\xA6F3 #\xA6F4 #\xA6F5
+                  #\xA6F6 #\xA6F7 #\xA874 #\xA875 #\xA876 #\xA877 #\xA8CE
+                  #\xA8CF #\xA8F8 #\xA8F9 #\xA8FA #\xA8FC #\xA92E #\xA92F
+                  #\xA95F #\xA9C1 #\xA9C2 #\xA9C3 #\xA9C4 #\xA9C5 #\xA9C6
+                  #\xA9C7 #\xA9C8 #\xA9C9 #\xA9CA #\xA9CB #\xA9CC #\xA9CD
+                  #\xA9DE #\xA9DF #\xAA5C #\xAA5D #\xAA5E #\xAA5F #\xAADE
+                  #\xAADF #\xAAF0 #\xAAF1 #\xABEB #\xFE10 #\xFE11 #\xFE12
+                  #\xFE13 #\xFE14 #\xFE15 #\xFE16 #\xFE19 #\xFE30 #\xFE45
+                  #\xFE46 #\xFE49 #\xFE4A #\xFE4B #\xFE4C #\xFE50 #\xFE51
+                  #\xFE52 #\xFE54 #\xFE55 #\xFE56 #\xFE57 #\xFE5F #\xFE60
+                  #\xFE61 #\xFE68 #\xFE6A #\xFE6B #\xFF01 #\xFF02 #\xFF03
+                  #\xFF05 #\xFF06 #\xFF07 #\xFF0A #\xFF0C #\xFF0E #\xFF0F
+                  #\xFF1A #\xFF1B #\xFF1F #\xFF20 #\xFF3C #\xFF61 #\xFF64
+                  #\xFF65 #\x10100 #\x10101 #\x10102 #\x1039F #\x103D0
+                  #\x1056F #\x10857 #\x1091F #\x1093F #\x10A50 #\x10A51
+                  #\x10A52 #\x10A53 #\x10A54 #\x10A55 #\x10A56 #\x10A57
+                  #\x10A58 #\x10A7F #\x10AF0 #\x10AF1 #\x10AF2 #\x10AF3
+                  #\x10AF4 #\x10AF5 #\x10AF6 #\x10B39 #\x10B3A #\x10B3B
+                  #\x10B3C #\x10B3D #\x10B3E #\x10B3F #\x10B99 #\x10B9A
+                  #\x10B9B #\x10B9C #\x10F55 #\x10F56 #\x10F57 #\x10F58
+                  #\x10F59 #\x11047 #\x11048 #\x11049 #\x1104A #\x1104B
+                  #\x1104C #\x1104D #\x110BB #\x110BC #\x110BE #\x110BF
+                  #\x110C0 #\x110C1 #\x11140 #\x11141 #\x11142 #\x11143
+                  #\x11174 #\x11175 #\x111C5 #\x111C6 #\x111C7 #\x111C8
+                  #\x111CD #\x111DB #\x111DD #\x111DE #\x111DF #\x11238
+                  #\x11239 #\x1123A #\x1123B #\x1123C #\x1123D #\x112A9
+                  #\x1144B #\x1144C #\x1144D #\x1144E #\x1144F #\x1145A
+                  #\x1145B #\x1145D #\x114C6 #\x115C1 #\x115C2 #\x115C3
+                  #\x115C4 #\x115C5 #\x115C6 #\x115C7 #\x115C8 #\x115C9
+                  #\x115CA #\x115CB #\x115CC #\x115CD #\x115CE #\x115CF
+                  #\x115D0 #\x115D1 #\x115D2 #\x115D3 #\x115D4 #\x115D5
+                  #\x115D6 #\x115D7 #\x11641 #\x11642 #\x11643 #\x11660
+                  #\x11661 #\x11662 #\x11663 #\x11664 #\x11665 #\x11666
+                  #\x11667 #\x11668 #\x11669 #\x1166A #\x1166B #\x1166C
+                  #\x1173C #\x1173D #\x1173E #\x1183B #\x11944 #\x11945
+                  #\x11946 #\x119E2 #\x11A3F #\x11A40 #\x11A41 #\x11A42
+                  #\x11A43 #\x11A44 #\x11A45 #\x11A46 #\x11A9A #\x11A9B
+                  #\x11A9C #\x11A9E #\x11A9F #\x11AA0 #\x11AA1 #\x11AA2
+                  #\x11C41 #\x11C42 #\x11C43 #\x11C44 #\x11C45 #\x11C70
+                  #\x11C71 #\x11EF7 #\x11EF8 #\x11FFF #\x12470 #\x12471
+                  #\x12472 #\x12473 #\x12474 #\x16A6E #\x16A6F #\x16AF5
+                  #\x16B37 #\x16B38 #\x16B39 #\x16B3A #\x16B3B #\x16B44
+                  #\x16E97 #\x16E98 #\x16E99 #\x16E9A #\x16FE2 #\x1BC9F
+                  #\x1DA87 #\x1DA88 #\x1DA89 #\x1DA8A #\x1DA8B #\x1E95E
+                  #\x1E95F)
+
+        ;; Ps -- open punctuation
+        (char-set #\x0028 #\x005B #\x007B #\x0F3A #\x0F3C #\x169B #\x201A
+                  #\x201E #\x2045 #\x207D #\x208D #\x2308 #\x230A #\x2329
+                  #\x2768 #\x276A #\x276C #\x276E #\x2770 #\x2772 #\x2774
+                  #\x27C5 #\x27E6 #\x27E8 #\x27EA #\x27EC #\x27EE #\x2983
+                  #\x2985 #\x2987 #\x2989 #\x298B #\x298D #\x298F #\x2991
+                  #\x2993 #\x2995 #\x2997 #\x29D8 #\x29DA #\x29FC #\x2E22
+                  #\x2E24 #\x2E26 #\x2E28 #\x2E42 #\x3008 #\x300A #\x300C
+                  #\x300E #\x3010 #\x3014 #\x3016 #\x3018 #\x301A #\x301D
+                  #\xFD3F #\xFE17 #\xFE35 #\xFE37 #\xFE39 #\xFE3B #\xFE3D
+                  #\xFE3F #\xFE41 #\xFE43 #\xFE47 #\xFE59 #\xFE5B #\xFE5D
+                  #\xFF08 #\xFF3B #\xFF5B #\xFF5F #\xFF62)))))
diff --git a/tests/parse.scm b/tests/parse.scm
index bd7e494..4e036b3 100644
--- a/tests/parse.scm
+++ b/tests/parse.scm
@@ -252,7 +252,7 @@
 (test "testSingleWordTimerWithPunctuation"
       '(recipe
          (metadata ())
-         ((step ("Let it " (timer "rest" #f) ", after plating"))))
+         ((step ("Let it " (timer "rest" #f) ", then serve"))))
       (cook->list (parse-cook "Let it ~rest, then serve\n")))
 
 (test "testSingleWordTimerWithUnicodePunctuation"
@@ -283,7 +283,7 @@
 (test "testSingleWordIngredientWithUnicodePunctuation"
       '(recipe
          (metadata ())
-         ((step ("Add some " (ingredient "chilli" #f) "⸫ then bake"))))
+         ((step ("Add " (ingredient "chilli" #f) "⸫ then bake"))))
       (cook->list (parse-cook "Add @chilli⸫ then bake\n")))
 
 ; NOTE: the space is U+2009
-- 
cgit v1.2.3