# Provenance: reconstructed from the patch that created
# pyscv/Frontend/lexer.py (new file, mode 100644, index 0000000..f7ca18a,
# 292 insertions):
#   commit 681036ddc3891904f13c84197cd90e2472cb3e1c
#   From: Ekaitz Zarraga
#   Date: Fri, 23 Jul 2021 20:49:14 +0200
#   Subject: Change folder name
# (patch rendered by cgit v1.2.3)
"""Lexer for assembly source text.

The lexer pulls characters one at a time from a reader object and yields
``(TokenType, value)`` tuples.  The reader is expected to expose:

- ``char``: the current character; ``None`` before the first ``advance()``
  and ``""`` once the input is exhausted,
- ``advance()``: move to the next character,
- ``peek()``: the character following ``char`` without consuming it.
"""

from enum import Enum

# TODO Logging is interesting for debugging purposes, decide what to do with it
from .logger import newlogger
from .reader import Reader

logger = newlogger(__name__)
#import logging
#logger.setLevel(logging.WARN)


# Digits accepted by each numeric base.
binChars = set("01")
octChars = set("01234567")
decChars = set("0123456789")
hexChars = set("0123456789ABCDEFabcdef")


class TokenType(Enum):
    """
    These are the possible tokens that the lexer knows, they are converted to
    their most accurate representation in python.
    """
    identifier = 0      # Represented as strings
    label = 1           # Represented as strings or integer if they are numeric
    instruction = 2     # Represented as strings
    directive = 3       # Represented as strings
    integer = 4         # Represented as int
    character = 5       # Represented as strings of length 1
    string = 6          # Represented as strings
    float = 7           # Represented as ??
                        # (we need correct conversions to the binary value)
                        # TODO
    end = 8
    argsep = 9
    openparens = 10
    closeparens = 11


class Lexer:
    """Iterator that turns the characters of a reader into tokens.

    Each item produced is a ``(TokenType, value)`` tuple; ``value`` is
    ``None`` for purely structural tokens (end, argsep, parentheses).
    """

    # Escape sequences understood inside string and character literals,
    # keyed by the character that follows the backslash.
    _escapes = {
        '"': '"',
        "'": "'",
        "n": "\n",
        "t": "\t",
        "r": "\r",
        "0": "\0",
        "\\": "\\",
    }

    def __init__(self, reader):
        self.reader = reader
        self.tokenstart = 0

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next token.

        Raises:
            StopIteration: when the reader reports end of input (``""``).
            ValueError: when the current character cannot start any token,
                or when a literal is malformed.
        """
        reader = self.reader
        if reader.char is None:
            # The reader has not been primed yet.
            reader.advance()

        while True:
            ch = reader.char

            # File end
            if ch == "":
                raise StopIteration

            # Instruction end
            if ch == "\n" or ch == ";":
                reader.advance()
                return (TokenType.end, None)

            # Spaces
            if ch.isspace():
                reader.advance()
                continue

            # Argument separator
            if ch == ",":
                reader.advance()
                return (TokenType.argsep, None)

            # String
            if ch == '"':
                return self.string()

            # Character
            if ch == "'":
                return self.character()

            # Comment: skipped, the newline that ends it is lexed normally
            if ch == "#":
                self.comment()
                continue

            # Parenthesis
            # Load a register as an address + an offset
            if ch == "(":
                reader.advance()
                return (TokenType.openparens, None)
            if ch == ")":
                reader.advance()
                return (TokenType.closeparens, None)

            # Starts with digit:
            # - Numbers (any kind)
            # - Numeric Labels
            # - Numeric Label references
            # - Load register as address:  ld a1, 4(a0)
            #                                     ^^^^^
            #                                     offset + reg
            if ch.isdigit() or ch == "-":
                return self.number()

            # Identifiers or labels
            if ch.isalpha() or ch == "_":
                return self.identifier()

            # Directive
            if ch == "." and reader.peek().isalpha():
                return self.directive()

            raise ValueError("Don't know how to lex")

    def string(self):
        """Lex a double-quoted string; the reader sits on the opening quote.

        Returns:
            ``(TokenType.string, text)`` with escape sequences decoded.

        Raises:
            ValueError: if EOF or a newline is found before the closing
                quote.
        """
        logger.info("Found string")
        self.reader.advance()  # Ignore opening quotes

        pieces = []
        escaped = False  # Set if previous character was a backslash

        while True:
            ch = self.reader.char
            if ch == "":
                # TODO: Check how to do this
                raise ValueError("Error: string not closed, found EOF")
            if ch == "\n":
                # TODO: Consider the string as closed and continue but report
                # the error?
                raise ValueError("Error: string not closed, found newline")

            if escaped:
                decoded = self.escaped_char(ch)
                pieces.append(decoded)
                logger.debug("Escape sequence processed %s", repr(decoded))
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                break
            else:
                pieces.append(ch)
            # Every character handled above is consumed exactly once,
            # including the one that follows a backslash.
            self.reader.advance()

        text = "".join(pieces)
        logger.info("Lexed string %s", repr(text))
        self.reader.advance()  # Discard closing "
        return (TokenType.string, text)

    def character(self):
        """Lex a single-quoted character; the reader sits on the opening quote.

        Returns:
            ``(TokenType.character, ch)``.

        Raises:
            ValueError: on EOF, on a non printable character, or if the
                literal is not closed by ``'``.
        """
        logger.info("Found character")
        self.reader.advance()  # Ignore the opening quote
        character = self.reader.char
        if character == "":
            # TODO: Check how to do this
            raise ValueError("Error: found EOF")
        if not character.isprintable():
            raise ValueError("Error: Non printable character")

        if character == "\\":
            self.reader.advance()
            character = self.escaped_char(self.reader.char)
            logger.debug("Escape sequence processed: %s", repr(character))

        # Make sure it's correctly closed
        self.reader.advance()
        if self.reader.char != "'":
            raise ValueError(
                "Parse error: expected `'`, found " + self.reader.char)
        self.reader.advance()  # Discard closing '

        logger.info("Lexed char %s", repr(character))
        return (TokenType.character, character)

    def comment(self):
        """Skip a ``#`` comment.

        Stops on the newline (left in place so it is lexed as an ``end``
        token) or on EOF, whichever comes first.
        """
        while self.reader.char not in ("\n", ""):
            self.reader.advance()

    def escaped_char(self, ch):
        """Return the character denoted by backslash followed by ``ch``.

        Unknown escape sequences decode to the empty string.
        """
        # TODO: implement more escape sequences
        return self._escapes.get(ch, "")

    def number(self):
        """
        Process anything that starts with a number. Could be:
        - An actual number in hex, octal, binary or decimal
        - Numeric labels or numeric label references
        - floating point (not implemented yet)

        A leading ``-`` is also routed here by ``__next__``.

        Raises:
            ValueError: if a base prefix or a ``-`` is not followed by
                digits.
        """
        numbstr = self.reader.char
        self.reader.advance()

        logger.info("Found number: %s", numbstr)

        # Hex, Bin, Oct and the 0
        if numbstr == "0":
            # The leading "0" is already consumed, so the base marker is the
            # current character, not the one after it.
            reprid = self.reader.char
            if reprid == "x":
                self.reader.advance()
                return self.hex()
            # "0b" alone is a backwards reference to numeric label 0; it is
            # only a binary prefix when a binary digit follows.
            if reprid == "b" and self.reader.peek() in binChars:
                self.reader.advance()
                return self.bin()
            if reprid.isdigit():
                return self.oct()
            # Anything else ("0", "0:", "0b", "0f") is handled below.

        # TODO: floating point numbers

        # Decimal number
        # consume a decimal number
        while self.reader.char in decChars:
            numbstr += self.reader.char
            self.reader.advance()

        if numbstr == "-":
            raise ValueError("Error: expected digits after `-`")

        # Numeric label reference
        if self.reader.char in ("b", "f"):
            numbstr += self.reader.char
            self.reader.advance()
            return (TokenType.identifier, numbstr)
            # TODO: Return identifier or label?
            # It's a label-ref not a label!

        # Numeric label
        # Compared with ``==``: at EOF the current character is "" and
        # ``"" in ":"`` would be true.
        if self.reader.char == ":":
            self.reader.advance()
            return (TokenType.label, numbstr)

        return (TokenType.integer, int(numbstr))

    def _based_integer(self, digits, base, name):
        """Consume characters from ``digits`` and return an integer token.

        Raises:
            ValueError: if the current character is not a valid digit.
        """
        numbstr = ""
        while self.reader.char in digits:
            numbstr += self.reader.char
            self.reader.advance()
        if not numbstr:
            raise ValueError(
                "Error: expected " + name + " digits, found "
                + repr(self.reader.char))
        return (TokenType.integer, int(numbstr, base))

    def hex(self):
        """Lex the digits of a hexadecimal literal (prefix already consumed)."""
        return self._based_integer(hexChars, 16, "hexadecimal")

    def oct(self):
        """Lex the digits of an octal literal (leading 0 already consumed)."""
        return self._based_integer(octChars, 8, "octal")

    def bin(self):
        """Lex the digits of a binary literal (prefix already consumed)."""
        return self._based_integer(binChars, 2, "binary")

    def identifier(self):
        """Lex an identifier, or a label when it is followed by ``:``."""
        logger.info("Found identifier")
        pieces = []
        while self.reader.char.isalnum() or self.reader.char == "_":
            pieces.append(self.reader.char)
            self.reader.advance()
        name = "".join(pieces)

        if self.reader.char == ":":
            self.reader.advance()
            return (TokenType.label, name)
        return (TokenType.identifier, name)

    def directive(self):
        """Lex a directive; the leading dot is not part of the value."""
        logger.info("Found directive")
        self.reader.advance()  # Discard leading dot
        pieces = []
        while self.reader.char.isalnum() or self.reader.char == "_":
            pieces.append(self.reader.char)
            self.reader.advance()
        return (TokenType.directive, "".join(pieces))


if __name__ == "__main__":
    import sys
    with Reader(sys.argv[1]) as src:
        lexer = Lexer(src)
        for token in lexer:
            print(token)