summaryrefslogtreecommitdiff
path: root/pyscv/Frontend/lexer.py
diff options
context:
space:
mode:
Diffstat (limited to 'pyscv/Frontend/lexer.py')
-rw-r--r--pyscv/Frontend/lexer.py292
1 files changed, 292 insertions, 0 deletions
diff --git a/pyscv/Frontend/lexer.py b/pyscv/Frontend/lexer.py
new file mode 100644
index 0000000..f7ca18a
--- /dev/null
+++ b/pyscv/Frontend/lexer.py
@@ -0,0 +1,292 @@
+# TODO: Logging is useful for debugging; decide whether to keep it.
+from .logger import newlogger
+logger = newlogger(__name__)
+#import logging
+#logger.setLevel(logging.WARN)
+
+
+######
+
+from enum import Enum
+from .reader import Reader
+
+# Digit alphabets accepted for each integer-literal base.
+binChars = {"0", "1"}
+octChars = {str(digit) for digit in range(8)}
+decChars = {str(digit) for digit in range(10)}
+hexChars = decChars | set("ABCDEFabcdef")
+
+class TokenType(Enum):
+    """
+    Kinds of token the lexer can produce.
+
+    Token values are converted to the closest matching Python type; the
+    comment above each member names that type.
+    """
+    # Value: string
+    identifier = 0
+    # Value: string, or integer when the label is numeric
+    label = 1
+    # Value: string
+    instruction = 2
+    # Value: string
+    directive = 3
+    # Value: int
+    integer = 4
+    # Value: string of length 1
+    character = 5
+    # Value: string
+    string = 6
+    # Value: representation still undecided (??)
+    # TODO: needs a correct conversion to the binary value
+    float = 7
+    end = 8
+    argsep = 9
+    openparens = 10
+    closeparens = 11
+
+class Lexer:
+    """
+    Iterator that turns the characters supplied by a reader into tokens.
+
+    Every token is a ``(TokenType, value)`` tuple. The purely structural
+    tokens (``end``, ``argsep``, ``openparens``, ``closeparens``) carry
+    ``None`` as their value.
+
+    The reader is only used through ``char``, ``advance()`` and ``peek()``.
+    NOTE(review): this class assumes ``char`` is ``None`` before the first
+    ``advance()``, is ``""`` at end of input, and that ``peek()`` returns the
+    character following ``char`` -- confirm against the Reader class.
+    """
+
+    # Escape sequences understood inside string and character literals,
+    # keyed by the character that follows the backslash.
+    _ESCAPES = {
+        '"': '"',
+        "'": "'",
+        "n": "\n",
+        "t": "\t",
+        "r": "\r",
+        "0": "\0",
+        "\\": "\\",
+    }
+
+    def __init__(self, reader):
+        self.reader = reader
+        self.tokenstart = 0
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """
+        Return the next token of the input.
+
+        Raises:
+            StopIteration: when the end of the input is reached.
+            ValueError: when the current character cannot start any token,
+                or when one of the token sub-lexers rejects the input.
+        """
+        if self.reader.char is None:
+            # Nothing has been read yet: load the first character.
+            self.reader.advance()
+
+        while True:
+            # Re-read on every iteration: whitespace and comments advance
+            # the reader without producing a token.
+            ch = self.reader.char
+
+            # File end
+            if ch == "":
+                raise StopIteration
+
+            # Instruction end (checked before the generic whitespace test,
+            # because a newline is also whitespace)
+            if ch == "\n" or ch == ";":
+                self.reader.advance()
+                return (TokenType.end, None)
+
+            # Spaces
+            if ch.isspace():
+                self.reader.advance()
+                continue
+
+            # Comment: skipped, the newline that ends it is lexed normally
+            if ch == "#":
+                self.comment()
+                continue
+
+            # Argument separator
+            if ch == ",":
+                self.reader.advance()
+                return (TokenType.argsep, None)
+
+            # String
+            if ch == '"':
+                return self.string()
+
+            # Character
+            if ch == "'":
+                return self.character()
+
+            # Parenthesis
+            # Load a register as an address + an offset
+            if ch == "(":
+                self.reader.advance()
+                return (TokenType.openparens, None)
+            if ch == ")":
+                self.reader.advance()
+                return (TokenType.closeparens, None)
+
+            # Starts with digit:
+            # - Numbers (any kind)
+            # - Numeric Labels
+            # - Numeric Label references
+            # - Load register as address: ld a1, 4(a0)
+            #                                    ^^^^^
+            #                                    offset + reg
+            if ch.isdigit() or ch == "-":
+                return self.number()
+
+            # Identifiers or labels
+            if ch.isalpha() or ch == "_":
+                return self.identifier()
+
+            # Directive
+            if ch == "." and self.reader.peek().isalpha():
+                return self.directive()
+
+            raise ValueError("Don't know how to lex")
+
+    def string(self):
+        """
+        Lex a double-quoted string; the reader must be on the opening quote.
+
+        Returns:
+            ``(TokenType.string, text)`` with escape sequences decoded.
+
+        Raises:
+            ValueError: if EOF or a newline is found before the closing
+                quote.
+        """
+        logger.info("Found string")
+        self.reader.advance()  # Ignore opening quotes
+
+        pieces = []
+        escaped = False  # Set if previous character was a backslash
+
+        while escaped or self.reader.char != '"':
+            ch = self.reader.char
+            if ch == "":
+                # TODO: Check how to do this
+                raise ValueError("Error: string not closed, found EOF")
+            if ch == "\n":
+                # TODO: Consider the string as closed and continue but report
+                # the error?
+                raise ValueError("Error: string not closed, found newline")
+
+            if escaped:
+                decoded = self.escaped_char(ch)
+                pieces.append(decoded)
+                logger.debug("Escape sequence processed %s", repr(decoded))
+                escaped = False
+            elif ch == "\\":
+                escaped = True
+            else:
+                pieces.append(ch)
+
+            # Every branch consumes exactly one character. Skipping this
+            # after an escape would lex the escaped character a second time.
+            self.reader.advance()
+
+        text = "".join(pieces)
+        logger.info("Lexed string %s", repr(text))
+        self.reader.advance()  # Discard closing "
+        return (TokenType.string, text)
+
+    def character(self):
+        """
+        Lex a single-quoted character; the reader must be on the opening
+        quote.
+
+        Returns:
+            ``(TokenType.character, ch)`` where ``ch`` has length 1.
+
+        Raises:
+            ValueError: on EOF, on a non printable character, on an unknown
+                escape sequence or when the closing quote is missing.
+        """
+        logger.info("Found character")
+        self.reader.advance()  # Ignore the opening quote
+        character = self.reader.char
+        if character == "":
+            # TODO: Check how to do this
+            raise ValueError("Error: found EOF")
+        if not character.isprintable():
+            raise ValueError("Error: Non printable character")
+
+        if character == "\\":
+            self.reader.advance()
+            escape = self.reader.char
+            character = self.escaped_char(escape)
+            if character == "":
+                # An unknown escape decodes to nothing, which is not a valid
+                # character token.
+                raise ValueError(
+                    "Error: unknown escape sequence " + repr("\\" + escape))
+            logger.debug("Escape sequence processed: %s", repr(character))
+
+        # Make sure it's correctly closed
+        self.reader.advance()
+        if self.reader.char != "'":
+            raise ValueError(
+                "Parse error: expected `'`, found " + self.reader.char)
+        self.reader.advance()  # Discard closing '
+
+        logger.info("Lexed char %s", repr(character))
+        return (TokenType.character, character)
+
+    def comment(self):
+        """
+        Skip the rest of a ``#`` comment.
+
+        The reader is left on the newline that ends the comment (or at EOF),
+        so the caller still emits the ``end`` token for that line.
+        """
+        while self.reader.char not in ("\n", ""):
+            self.reader.advance()
+
+    def escaped_char(self, ch):
+        """
+        Decode the character that follows a backslash.
+
+        Returns:
+            The decoded character, or ``""`` when the sequence is unknown.
+        """
+        # TODO: implement more escape sequences
+        return self._ESCAPES.get(ch, "")
+
+    def number(self):
+        """
+        Process anything that starts with a digit or a minus sign. Could be:
+        - An actual number in hex, octal, binary or decimal
+        - Numeric labels or numeric label references
+        - floating point (not implemented yet)
+
+        Returns:
+            ``(TokenType.integer, int)``, ``(TokenType.label, str)`` for
+            ``N:`` or ``(TokenType.identifier, str)`` for ``Nb`` / ``Nf``.
+
+        Raises:
+            ValueError: if the digits do not form a valid number.
+        """
+        numbstr = self.reader.char
+        self.reader.advance()
+
+        logger.info("Found number: %s", numbstr)
+
+        # Hex, Bin and Oct prefixes
+        if numbstr == "0":
+            # The leading zero is already consumed, so the base marker (if
+            # any) is the current character, not the peeked one.
+            marker = self.reader.char
+            if marker == "x":
+                self.reader.advance()
+                return self.hex()
+            # "0b" alone is a backward reference to numeric label 0; it is
+            # only a binary prefix when a binary digit follows.
+            if marker == "b" and self.reader.peek() in binChars:
+                self.reader.advance()
+                return self.bin()
+            if marker.isdigit():
+                return self.oct()
+            # Plain "0", "0:", "0b" and "0f" fall through to the generic
+            # handling below.
+
+        # TODO: floating point numbers
+
+        # Decimal number
+        # consume a decimal number
+        while self.reader.char in decChars:
+            numbstr += self.reader.char
+            self.reader.advance()
+
+        # Numeric label reference
+        if self.reader.char in ("b", "f"):
+            numbstr += self.reader.char
+            self.reader.advance()
+            return (TokenType.identifier, numbstr)
+            # TODO: Return identifier or label?
+            # It's a label-ref not a label!
+
+        # Numeric label
+        # (plain equality: `"" in ":"` is True and would turn a number at
+        # EOF into a label)
+        if self.reader.char == ":":
+            self.reader.advance()
+            return (TokenType.label, numbstr)
+
+        return (TokenType.integer, int(numbstr))
+
+    def _integer_in_base(self, alphabet, base):
+        """Consume the longest run of `alphabet` characters as an integer."""
+        digits = []
+        while self.reader.char in alphabet:
+            digits.append(self.reader.char)
+            self.reader.advance()
+        if not digits:
+            raise ValueError(
+                "Parse error: expected base-%d digits, found %r"
+                % (base, self.reader.char))
+        return (TokenType.integer, int("".join(digits), base))
+
+    def hex(self):
+        """Lex the digits of a hexadecimal literal (after the ``0x``)."""
+        return self._integer_in_base(hexChars, 16)
+
+    def oct(self):
+        """Lex the digits of an octal literal (after the leading ``0``)."""
+        return self._integer_in_base(octChars, 8)
+
+    def bin(self):
+        """Lex the digits of a binary literal (after the ``0b``)."""
+        return self._integer_in_base(binChars, 2)
+
+    def identifier(self):
+        """
+        Lex a run of alphanumerics and underscores.
+
+        Returns:
+            ``(TokenType.label, name)`` when a ``:`` follows the name (the
+            colon is consumed), ``(TokenType.identifier, name)`` otherwise.
+        """
+        logger.info("Found identifier")
+        s = ""
+        while self.reader.char.isalnum() or self.reader.char == "_":
+            s += self.reader.char
+            self.reader.advance()
+
+        if self.reader.char == ":":
+            self.reader.advance()
+            return (TokenType.label, s)
+        return (TokenType.identifier, s)
+
+    def directive(self):
+        """
+        Lex a directive; the reader must be on its leading dot.
+
+        Returns:
+            ``(TokenType.directive, name)`` with the dot stripped.
+        """
+        logger.info("Found directive")
+        s = ""
+        self.reader.advance()  # Discard leading dot
+        while self.reader.char.isalnum() or self.reader.char == "_":
+            s += self.reader.char
+            self.reader.advance()
+        return (TokenType.directive, s)
+
+if __name__ == "__main__":
+    # Debug driver: print every token lexed from the file named by the
+    # first command line argument.
+    import sys
+
+    source_path = sys.argv[1]
+    with Reader(source_path) as src:
+        for token in Lexer(src):
+            print(token)