"""Lexer for an assembly-like source language.

Turns the character stream provided by a reader object into
``(TokenType, value)`` tuples.
"""

# TODO Logging is interesting for debugging purposes, decide what to do with it
from logger import newlogger
logger = newlogger(__name__)
#import logging
#logger.setLevel(logging.WARN)
######

from enum import Enum
from reader import Reader

# Valid digit characters for each supported numeric base.
binChars = set("01")
octChars = set("01234567")
decChars = set("0123456789")
hexChars = set("0123456789ABCDEFabcdef")


class TokenType(Enum):
    """
    These are the possible tokens that the lexer knows, they are converted to
    their most accurate representation in python.
    """
    identifier  = 0     # Represented as strings
    label       = 1     # Represented as strings (numeric labels included)
    instruction = 2     # Represented as strings
    directive   = 3     # Represented as strings
    integer     = 4     # Represented as int
    character   = 5     # Represented as strings of length 1
    string      = 6     # Represented as strings
    float       = 7     # Represented as ??
                        # (we need correct conversions to the binary value)
                        # TODO
    end         = 8     # Instruction end (newline or `;`), value is None
    argsep      = 9     # Argument separator (`,`), value is None
    openparens  = 10    # `(`, value is None
    closeparens = 11    # `)`, value is None


class Lexer:
    """
    Iterator that yields ``(TokenType, value)`` tuples read from `reader`.

    The reader is used through `reader.char` (the current character) and
    `reader.advance()` (move to the next character). As relied on by the
    checks in this class, `reader.char` is `None` before the first advance
    and the empty string once the input is exhausted.
    """

    def __init__(self, reader):
        self.reader = reader
        self.tokenstart = 0

    def __iter__(self):
        return self

    def __next__(self):
        """
        Return the next token as a ``(TokenType, value)`` tuple.

        Characters that do not start any known token (whitespace included)
        are skipped. Raises StopIteration at the end of the input and
        ValueError (from the helper methods) on malformed literals.
        """
        if self.reader.char is None:
            # Nothing has been read yet: load the first character
            self.reader.advance()

        while True:
            ch = self.reader.char

            # FILE END
            if ch == "":
                raise StopIteration

            # Instruction end
            if ch == "\n" or ch == ";":
                self.reader.advance()
                return (TokenType.end, None)

            # Argument separator
            if ch == ",":
                self.reader.advance()
                return (TokenType.argsep, None)

            # String
            if ch == '"':
                return self.string()

            # Character
            if ch == "'":
                return self.character()

            # Comment: produces no token, keep scanning afterwards
            if ch == "#":
                self.comment()
                continue

            # Parenthesis
            # Load a register as an address + an offset
            if ch == "(":
                self.reader.advance()
                return (TokenType.openparens, None)
            if ch == ")":
                self.reader.advance()
                return (TokenType.closeparens, None)

            # Starts with digit:
            # - Numbers (any kind)
            # - Numeric Labels
            # - Numeric Label references
            # - Load register as address:  ld a1, 4(a0)
            #                                     ^^^^^
            #                                     offset + reg
            if ch.isdigit() or ch == "-":
                return self.number()

            # Identifiers
            if ch.isalpha():
                return self.identifier()

            # TODO: Remove this, it's just for testing
            self.reader.advance()

    def string(self):
        """
        Lex a double-quoted string; the reader must be on the opening quote.

        Escape sequences are translated through `escaped_char`.

        Returns ``(TokenType.string, str)``.
        Raises ValueError if EOF or a newline is found before the closing
        quote.
        """
        logger.info("Found string")
        self.reader.advance()   # Ignore opening quotes

        pieces = []
        escaped = False         # Set if previous character was a backslash
        while True:
            ch = self.reader.char
            if ch == "":
                # TODO: Check how to do this
                raise ValueError("Error: string not closed, found EOF")
            if ch == "\n":
                # TODO: Consider the string as closed and continue but report
                # the error?
                raise ValueError("Error: string not closed, found newline")

            if escaped:
                decoded = self.escaped_char(ch)
                logger.debug("Escape sequence processed %s", repr(decoded))
                pieces.append(decoded)
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                break
            else:
                pieces.append(ch)

            # Every branch consumes exactly one character, so the character
            # following a backslash is never interpreted a second time.
            self.reader.advance()

        text = "".join(pieces)
        logger.info("Lexed string %s", repr(text))
        self.reader.advance()   # Discard closing "
        return (TokenType.string, text)

    def character(self):
        """
        Lex a single-quoted character; the reader must be on the opening
        quote.

        Returns ``(TokenType.character, str)``.
        Raises ValueError on EOF, on a non printable character, or when the
        closing quote is missing.
        """
        logger.info("Found character")
        self.reader.advance()   # Ignore the opening quote

        character = self.reader.char
        if character == "":
            # TODO: Check how to do this
            raise ValueError("Error: found EOF")
        if not character.isprintable():
            raise ValueError("Error: Non printable character")

        if character == "\\":
            self.reader.advance()
            character = self.escaped_char(self.reader.char)
            logger.debug("Escape sequence processed: %s", repr(character))

        # Make sure it's correctly closed
        self.reader.advance()
        if self.reader.char != "'":
            raise ValueError("Parse error: expected `'`, found "
                             + self.reader.char)

        self.reader.advance()   # Discard closing '
        logger.info("Lexed char %s", repr(character))
        return (TokenType.character, character)

    def comment(self):
        """
        Skip a comment up to, but not including, the end of the line.

        The newline is left in the reader so that it still produces an
        `end` token. Also stops at the end of the input.
        """
        while self.reader.char != "\n" and self.reader.char != "":
            self.reader.advance()

    def escaped_char(self, ch):
        """
        Translate the character that follows a backslash.

        Returns the character the escape sequence stands for, or the empty
        string if the sequence is not known.
        """
        # TODO: implement more escape sequences
        escapes = {
            '"': '"',
            "'": "'",
            "n": "\n",
            "t": "\t",
            "r": "\r",
            "0": "\0",
            "\\": "\\",
        }
        return escapes.get(ch, "")

    def number(self):
        """
        Process anything that starts with a number. Could be:
        - An actual number in hex, octal, binary or decimal
        - Numeric labels or numeric label references
        - floating point (not implemented yet)

        Returns ``(TokenType.integer, int)``, ``(TokenType.label, str)`` for
        a numeric label or ``(TokenType.identifier, str)`` for a numeric
        label reference.
        Raises ValueError if the digits cannot be converted to an integer.
        """
        numbstr = self.reader.char
        self.reader.advance()
        logger.info("Found number: %s", numbstr)

        # Hex, Bin, Oct and the 0
        if numbstr == "0":
            # The leading 0 is already consumed, so the base marker is the
            # current character.
            reprid = self.reader.char
            if reprid == "x":
                self.reader.advance()
                return self.hex()
            elif reprid == "b":
                self.reader.advance()
                return self.bin()
            elif reprid.isdigit():
                return self.oct()
            else:
                return (TokenType.integer, 0)

        # TODO: floating point numbers

        # Decimal number
        # consume a decimal number
        digits = [numbstr]
        while self.reader.char in decChars:
            digits.append(self.reader.char)
            self.reader.advance()
        numbstr = "".join(digits)

        # Numeric label reference
        if self.reader.char == "b" or self.reader.char == "f":
            numbstr += self.reader.char
            self.reader.advance()
            return (TokenType.identifier, numbstr)
            # TODO: Return identifier or label?
            # It's a label-ref not a label!

        # Numeric label
        # Compared with `==`: `"" in ":"` is True, so a membership test would
        # turn a number placed right before the end of input into a label.
        if self.reader.char == ":":
            self.reader.advance()
            return (TokenType.label, numbstr)

        return (TokenType.integer, int(numbstr))

    def _based_integer(self, valid_chars, base, name):
        """
        Consume the longest run of `valid_chars` and convert it from `base`.

        Raises ValueError if there is no digit to consume; `name` is only
        used to build the error message.
        """
        digits = []
        while self.reader.char in valid_chars:
            digits.append(self.reader.char)
            self.reader.advance()
        if not digits:
            raise ValueError("Error: expected %s digits, found %s"
                             % (name, repr(self.reader.char)))
        return (TokenType.integer, int("".join(digits), base))

    def hex(self):
        """Lex the digits of a hexadecimal number (prefix already consumed)."""
        return self._based_integer(hexChars, 16, "hexadecimal")

    def oct(self):
        """Lex the digits of an octal number (leading 0 already consumed)."""
        return self._based_integer(octChars, 8, "octal")

    def bin(self):
        """Lex the digits of a binary number (prefix already consumed)."""
        return self._based_integer(binChars, 2, "binary")

    def identifier(self):
        """
        Lex a run of alphanumeric characters and underscores.

        Returns ``(TokenType.identifier, str)``.
        """
        chars = []
        while self.reader.char.isalnum() or self.reader.char == "_":
            chars.append(self.reader.char)
            self.reader.advance()
        return (TokenType.identifier, "".join(chars))


if __name__ == "__main__":
    import sys

    # Print every token of the file given as first argument
    with Reader(sys.argv[1]) as src:
        lexer = Lexer(src)
        for token in lexer:
            print(token)