diff options
author | Ekaitz Zarraga <ekaitz@elenq.tech> | 2021-07-23 20:49:14 +0200 |
---|---|---|
committer | Ekaitz Zarraga <ekaitz@elenq.tech> | 2021-07-23 20:49:14 +0200 |
commit | 681036ddc3891904f13c84197cd90e2472cb3e1c (patch) | |
tree | 080b9580c14274698d1848e7640cba0cc47a53a6 /pysc-v/Frontend/lexer.py | |
parent | 309d36182ef32a1bc5bff84f39e9e81db0ddb9a6 (diff) |
Diffstat (limited to 'pysc-v/Frontend/lexer.py')
-rw-r--r-- | pysc-v/Frontend/lexer.py | 269 |
1 files changed, 0 insertions, 269 deletions
"""Lexer for the pysc-v assembler frontend (pysc-v/Frontend/lexer.py).

The lexer pulls characters from a reader object and yields
``(TokenType, value)`` tuples.  The reader is duck-typed; the lexer only
relies on:

- ``reader.char``: the current character, ``None`` before the first call to
  ``advance()`` and ``""`` once the input is exhausted;
- ``reader.advance()``: move ``reader.char`` to the next character.
"""

# TODO Logging is interesting for debugging purposes, decide what to do with it
try:
    from logger import newlogger
except ImportError:
    # Project logger not importable (lexer used standalone): use the stdlib.
    import logging
    logger = logging.getLogger(__name__)
else:
    logger = newlogger(__name__)

from enum import Enum

binChars = set("01")
octChars = set("01234567")
decChars = set("0123456789")
hexChars = set("0123456789ABCDEFabcdef")

# Escape sequences understood inside string and character literals.
# TODO: implement more escape sequences (octal / hex forms)
_ESCAPES = {
    '"': '"',
    "'": "'",
    "n": "\n",
    "t": "\t",
    "r": "\r",
    "0": "\0",
    "\\": "\\",
}


class TokenType(Enum):
    """
    These are the possible tokens that the lexer knows, they are converted to
    their most accurate representation in python.
    """
    identifier = 0   # Represented as strings
    label = 1        # Represented as strings (also when they are numeric)
    instruction = 2  # Represented as strings
    directive = 3    # Represented as strings
    integer = 4      # Represented as int
    character = 5    # Represented as strings of length 1
    string = 6       # Represented as strings
    float = 7        # Represented as ??
                     # (we need correct conversions to the binary value)
                     # TODO
    end = 8
    argsep = 9
    openparens = 10
    closeparens = 11


class Lexer:
    """Iterator producing ``(TokenType, value)`` tuples from a reader."""

    def __init__(self, reader):
        self.reader = reader
        self.tokenstart = 0

    def __iter__(self):
        return self

    def __next__(self):
        """
        Return the next token.

        Raises StopIteration at the end of the input and ValueError on
        malformed literals.
        """
        reader = self.reader
        if reader.char is None:
            # Nothing read yet: load the first character
            reader.advance()

        while True:
            ch = reader.char

            # FILE END
            if not ch:
                raise StopIteration

            # Instruction end
            if ch == "\n" or ch == ";":
                reader.advance()
                return (TokenType.end, None)

            # Argument separator
            if ch == ",":
                reader.advance()
                return (TokenType.argsep, None)

            # String
            if ch == '"':
                return self.string()

            # Character
            if ch == "'":
                return self.character()

            # Comment: skipped, the newline after it still produces `end`
            if ch == "#":
                self.comment()
                continue

            # Parenthesis
            # Load a register as an address + an offset
            if ch == "(":
                reader.advance()
                return (TokenType.openparens, None)
            if ch == ")":
                reader.advance()
                return (TokenType.closeparens, None)

            # Starts with digit:
            # - Numbers (any kind)
            # - Numeric Labels
            # - Numeric Label references
            # - Load register as address: ld a1, 4(a0)
            #                                    ^^^^^
            #                                    offset + reg
            if ch in decChars or ch == "-":
                return self.number()

            # Identifiers
            if ch.isalpha():
                return self.identifier()

            # TODO: Remove this, it's just for testing
            # Anything else (whitespace included) is silently skipped
            reader.advance()

    def string(self):
        """
        Lex a double-quoted string; ``reader.char`` must be the opening quote.

        Returns ``(TokenType.string, str)`` with escape sequences resolved.
        Raises ValueError if EOF or a newline is found before the closing
        quote.
        """
        logger.info("Found string")
        reader = self.reader
        reader.advance()  # Ignore opening quotes

        parts = []
        while True:
            ch = reader.char
            if not ch:
                # TODO: Check how to do this
                raise ValueError("Error: string not closed, found EOF")
            if ch == "\n":
                # TODO: Consider the string as closed and continue but report
                # the error?
                raise ValueError("Error: string not closed, found newline")
            if ch == '"':
                break

            if ch == "\\":
                # Consume the backslash and translate the character after it
                reader.advance()
                escaped = reader.char
                if not escaped:
                    raise ValueError("Error: string not closed, found EOF")
                if escaped == "\n":
                    raise ValueError("Error: string not closed, found newline")
                parts.append(self.escaped_char(escaped))
                logger.debug("Escape sequence processed %r", parts[-1])
            else:
                parts.append(ch)
            # Move past the character just handled (escaped or not) so it is
            # never processed twice
            reader.advance()

        string = "".join(parts)
        logger.info("Lexed string %r", string)
        reader.advance()  # Discard closing "
        return (TokenType.string, string)

    def character(self):
        """
        Lex a single-quoted character; ``reader.char`` must be the opening
        quote.

        Returns ``(TokenType.character, str)``.  Raises ValueError on EOF, on
        a non printable character or when the closing quote is missing.
        """
        logger.info("Found character")
        reader = self.reader
        reader.advance()  # Ignore the opening quote
        character = reader.char
        if not character:
            # TODO: Check how to do this
            raise ValueError("Error: found EOF")
        if not character.isprintable():
            raise ValueError("Error: Non printable character")

        if character == "\\":
            reader.advance()
            character = self.escaped_char(reader.char)
            logger.debug("Escape sequence processed: %r", character)

        # Make sure it's correctly closed
        reader.advance()
        if reader.char != "'":
            raise ValueError(
                "Parse error: expected `'`, found " + reader.char)
        reader.advance()  # Discard closing '

        logger.info("Lexed char %r", character)
        return (TokenType.character, character)

    def comment(self):
        """
        Skip a comment up to, but not including, the end of the line.

        Also stops at EOF so an unterminated last line cannot loop forever.
        """
        reader = self.reader
        while reader.char and reader.char != "\n":
            reader.advance()

    def escaped_char(self, ch):
        """
        Translate the character following a backslash.

        Unknown escape sequences translate to the empty string.
        """
        return _ESCAPES.get(ch, "")

    def number(self):
        """
        Process anything that starts with a number. Could be:
        - An actual number in hex, octal, binary or decimal
        - Numeric labels or numeric label references
        - floating point (not implemented yet)

        Returns an integer, label or identifier (label reference) token.
        Raises ValueError when the expected digits are missing.
        """
        reader = self.reader
        numbstr = reader.char
        reader.advance()

        logger.info("Found number: %s", numbstr)

        # Hex, Bin, Oct and the 0
        if numbstr == "0":
            # The leading 0 is already consumed, so the radix marker is the
            # current character
            reprid = reader.char
            if reprid == "x":
                reader.advance()
                return self.hex()
            if reprid == "b":
                reader.advance()
                if reader.char in binChars:
                    return self.bin()
                # `0b` with no binary digits: backward reference to label 0
                return (TokenType.identifier, "0b")
            if reprid in decChars:
                return self.oct()
            # Plain 0, `0f` or `0:`: handled by the decimal path below

        # TODO: floating point numbers
        # TODO: negative hex/octal/binary literals (e.g. -0x10) are not
        #       handled

        # Decimal number
        # consume a decimal number
        while reader.char in decChars:
            numbstr += reader.char
            reader.advance()

        if numbstr == "-":
            raise ValueError("Error: expected digits after `-`")

        # Numeric label reference
        if reader.char in ("b", "f"):
            numbstr += reader.char
            reader.advance()
            return (TokenType.identifier, numbstr)
            # TODO: Return identifier or label?
            # It's a label-ref not a label!

        # Numeric label
        # (compare with ==: `"" in ":"` is True and would turn a number at
        # EOF into a label)
        if reader.char == ":":
            reader.advance()
            return (TokenType.label, numbstr)

        return (TokenType.integer, int(numbstr))

    def _radix_integer(self, allowed, base, kind):
        """Consume a run of `allowed` digits and convert it using `base`."""
        reader = self.reader
        digits = []
        while reader.char in allowed:
            digits.append(reader.char)
            reader.advance()
        if not digits:
            raise ValueError("Error: expected %s digits" % kind)
        return (TokenType.integer, int("".join(digits), base))

    def hex(self):
        """Lex the digits of a hexadecimal literal (after the `0x`)."""
        return self._radix_integer(hexChars, 16, "hexadecimal")

    def oct(self):
        """Lex the digits of an octal literal (after the leading `0`)."""
        return self._radix_integer(octChars, 8, "octal")

    def bin(self):
        """Lex the digits of a binary literal (after the `0b`)."""
        return self._radix_integer(binChars, 2, "binary")

    def identifier(self):
        """Lex a run of alphanumeric characters and underscores."""
        reader = self.reader
        chars = []
        while reader.char and (reader.char.isalnum() or reader.char == "_"):
            chars.append(reader.char)
            reader.advance()
        return (TokenType.identifier, "".join(chars))


if __name__ == "__main__":
    import sys

    # Only the command line driver needs the concrete file reader
    from reader import Reader

    with Reader(sys.argv[1]) as src:
        for token in Lexer(src):
            print(token)