From f6886876603b709d0c899a86581e867b9e061337 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Thu, 6 May 2021 22:25:04 +0200 Subject: Work randomly on the frontend --- pysc-v/Frontend/lexer.py | 273 +++++++++++++++++++++++++++++++++++++++++++++- pysc-v/Frontend/logger.py | 10 ++ pysc-v/Frontend/parser.py | 1 + pysc-v/Frontend/reader.py | 33 ++++-- 4 files changed, 305 insertions(+), 12 deletions(-) create mode 100644 pysc-v/Frontend/logger.py create mode 100644 pysc-v/Frontend/parser.py diff --git a/pysc-v/Frontend/lexer.py b/pysc-v/Frontend/lexer.py index 5175097..c4a9a7d 100644 --- a/pysc-v/Frontend/lexer.py +++ b/pysc-v/Frontend/lexer.py @@ -1,4 +1,269 @@ -# Returns: -# - label -# - instruction (with args) -# - directive (with args) +# TODO Logging is interesting for debugging purposes, decide what to do with it +from logger import newlogger +logger = newlogger(__name__) +#import logging +#logger.setLevel(logging.WARN) + + +###### + +from enum import Enum +from reader import Reader + +binChars = set("01") +octChars = set("01234567") +decChars = set("0123456789") +hexChars = set("0123456789ABCDEFabcdef") + +class TokenType(Enum): + """ + These are the possible tokens that the lexer knows, they are converted to + their most accurate representation in python. + """ + identifier = 0 # Represented as strings + label = 1 # Represented as strings or integer if they are numeric + instruction = 2 # Represented as strings + directive = 3 # Represented as strings + integer = 4 # Represented as int + character = 5 # Represented as strings of length 1 + string = 6 # Represented as strings + float = 7 # Represented as ?? 
class Lexer:
    """
    Iterator that splits the characters served by a reader into tokens.

    The reader must provide:
    - ``char``: the current character; ``None`` before the first read and
      ``""`` once the end of the file has been reached.
    - ``advance()``: load the next character into ``char``.
    - ``peek()``: return the character after ``char`` without consuming it.

    Every token is a ``(TokenType, value)`` tuple. Malformed literals are
    reported with ``ValueError``.
    """

    # Escape sequences understood inside string and character literals.
    # TODO: octal (\ooo) and hexadecimal (\xhh) escapes are not implemented.
    _ESCAPES = {
        '"': '"',
        "'": "'",
        "\\": "\\",
        "n": "\n",
        "t": "\t",
        "r": "\r",
        "b": "\b",
        "f": "\f",
    }

    def __init__(self, reader):
        self.reader = reader
        self.tokenstart = 0  # Not used yet

    def __iter__(self):
        return self

    def __next__(self):
        """
        Return the next token.

        Comments are skipped, and so is any character that does not start a
        known token (whitespace included). Raises ``StopIteration`` at the
        end of the file and ``ValueError`` on malformed literals.
        """
        if self.reader.char is None:
            self.reader.advance()  # First call: load the first character

        while True:
            ch = self.reader.char

            if ch == "":
                raise StopIteration  # FILE END

            # Instruction end
            if ch == "\n" or ch == ";":
                self.reader.advance()
                return (TokenType.end, None)

            # Argument separator
            if ch == ",":
                self.reader.advance()
                return (TokenType.argsep, None)

            if ch == '"':
                return self.string()

            if ch == "'":
                return self.character()

            # Comment: dropped, the newline that ends it is lexed normally
            if ch == "#":
                self.comment()
                continue

            # Parenthesis
            # Load a register as an address + an offset: ld a1, 4(a0)
            if ch == "(":
                self.reader.advance()
                return (TokenType.openparens, None)
            if ch == ")":
                self.reader.advance()
                return (TokenType.closeparens, None)

            # Starts with digit (or a minus sign):
            # - Numbers (any kind)
            # - Numeric labels and numeric label references
            # Only ASCII digits are accepted; str.isdigit() would also let
            # characters such as "²" through, which int() cannot convert.
            if ch in decChars or ch == "-":
                return self.number()

            # Identifiers
            if ch.isalpha():
                return self.identifier()

            # TODO: Remove this, it's just for testing
            self.reader.advance()

    def string(self):
        """
        Lex a double-quoted string; the reader is on the opening quote.

        Returns ``(TokenType.string, text)`` with the escape sequences
        already decoded. Raises ``ValueError`` if the end of the file or a
        newline is found before the closing quote.
        """
        logger.info("Found string")
        self.reader.advance()  # Ignore opening quotes

        pieces = []
        while True:
            escaped = self.reader.char == "\\"
            if escaped:
                # Move onto the escaped character so it is consumed exactly
                # once, together with its backslash.
                self.reader.advance()
            ch = self.reader.char

            if ch == "":
                # TODO: Check how to do this
                raise ValueError("Error: string not closed, found EOF")
            if ch == "\n":
                # TODO: Consider the string as closed and continue but report
                # the error?
                raise ValueError("Error: string not closed, found newline")

            if escaped:
                ch = self.escaped_char(ch)
                logger.debug("Escape sequence processed %s", repr(ch))
            elif ch == '"':
                break  # Only an unescaped quote closes the string

            pieces.append(ch)
            self.reader.advance()

        string = "".join(pieces)
        logger.info("Lexed string %s", repr(string))
        self.reader.advance()  # Discard closing "
        return (TokenType.string, string)

    def character(self):
        """
        Lex a single-quoted character; the reader is on the opening quote.

        Returns ``(TokenType.character, char)``. Raises ``ValueError`` on
        end of file, on a non printable character or when the closing quote
        is missing.
        """
        logger.info("Found character")
        self.reader.advance()  # Ignore the opening quote
        character = self.reader.char
        if character == "":
            # TODO: Check how to do this
            raise ValueError("Error: found EOF")
        if not character.isprintable():
            raise ValueError("Error: Non printable character")

        if character == "\\":
            self.reader.advance()
            character = self.escaped_char(self.reader.char)
            logger.debug("Escape sequence processed: %s", repr(character))

        # Make sure it's correctly closed
        self.reader.advance()
        if self.reader.char != "'":
            raise ValueError("Parse error: expected `'`, found "
                             + self.reader.char)
        self.reader.advance()  # Discard closing '

        logger.info("Lexed char %s", repr(character))
        return (TokenType.character, character)

    def comment(self):
        """
        Skip a comment. Stops on the newline that ends it (left in place so
        it produces its own token) or at the end of the file.
        """
        while self.reader.char != "\n" and self.reader.char != "":
            self.reader.advance()

    def escaped_char(self, ch):
        """
        Return the character meant by a backslash followed by `ch`.

        An unknown escape is kept as the bare character (the backslash is
        dropped) and reported with a warning.
        """
        try:
            return self._ESCAPES[ch]
        except KeyError:
            logger.warning("Unknown escape sequence \\%s, backslash ignored",
                           ch)
            return ch

    def number(self):
        """
        Process anything that starts with a digit or a minus sign.
        Could be:
        - An actual number in hex (0x...), binary (0b...), octal (0...) or
          decimal, returned as ``(TokenType.integer, int)``
        - A numeric label (``1:``), returned as ``(TokenType.label, str)``
        - A numeric label reference (``1b``, ``1f``), returned as
          ``(TokenType.identifier, str)``
        - floating point (not implemented yet)

        Raises ``ValueError`` when the digits are missing.
        """
        logger.info("Found number: %s", self.reader.char)

        negative = self.reader.char == "-"
        if negative:
            self.reader.advance()
            if self.reader.char not in decChars:
                raise ValueError("Error: expected a digit after `-`")

        first = self.reader.char
        self.reader.advance()

        # Hex, Bin, Oct and the 0
        if first == "0":
            # advance() already moved past the "0": the base prefix is the
            # current character, not the next one.
            prefix = self.reader.char
            if prefix == "x":
                self.reader.advance()
                token = self.hex()
            elif prefix == "b" and self.reader.peek() in binChars:
                # Without binary digits behind it, "0b" is a backwards
                # reference to the numeric label 0 and is handled below.
                self.reader.advance()
                token = self.bin()
            elif prefix in decChars:
                token = self.oct()
            else:
                token = None
            if token is not None:
                tokentype, value = token
                return (tokentype, -value if negative else value)

        # TODO: floating point numbers

        # Decimal number
        numbstr = ("-" if negative else "") + first
        numbstr += self._take_while(decChars)

        # Numeric label reference
        if self.reader.char == "b" or self.reader.char == "f":
            numbstr += self.reader.char
            self.reader.advance()
            return (TokenType.identifier, numbstr)
            # TODO: Return identifier or label?
            # It's a label-ref not a label!

        # Numeric label. Compared with == because `"" in ":"` is true, which
        # would turn a number at the end of the file into a label.
        if self.reader.char == ":":
            self.reader.advance()
            return (TokenType.label, numbstr)

        return (TokenType.integer, int(numbstr))

    def hex(self):
        """Lex the hexadecimal digits that follow a ``0x`` prefix."""
        return self._based_integer(hexChars, 16, "hexadecimal")

    def oct(self):
        """Lex the octal digits that follow a leading ``0``."""
        return self._based_integer(octChars, 8, "octal")

    def bin(self):
        """Lex the binary digits that follow a ``0b`` prefix."""
        return self._based_integer(binChars, 2, "binary")

    def identifier(self):
        """
        Lex a run of alphanumeric characters and underscores.
        Returns ``(TokenType.identifier, name)``.
        """
        chars = []
        while self.reader.char.isalnum() or self.reader.char == "_":
            chars.append(self.reader.char)
            self.reader.advance()
        return (TokenType.identifier, "".join(chars))

    def _take_while(self, allowed):
        """
        Consume and return the run of characters contained in `allowed`.
        `allowed` has to be a set: the empty string found at the end of the
        file is `in` every str, but in none of the character sets.
        """
        taken = []
        while self.reader.char in allowed:
            taken.append(self.reader.char)
            self.reader.advance()
        return "".join(taken)

    def _based_integer(self, allowed, base, what):
        """
        Lex the digits of an integer written in `base`. Raises ``ValueError``
        naming the base (`what`) when there is no digit to read.
        """
        digits = self._take_while(allowed)
        if not digits:
            raise ValueError("Error: expected %s digits, found %s"
                             % (what, repr(self.reader.char)))
        return (TokenType.integer, int(digits, base))
def newlogger(name, level=logging.DEBUG):
    """
    Return the logger called `name` with its threshold set to `level`.

    `level` defaults to ``logging.DEBUG``, so existing callers keep the
    verbose output; pass e.g. ``logging.WARN`` to quiet a module down
    without having to call ``setLevel`` on the returned logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)
    return logger