summaryrefslogtreecommitdiff
path: root/pysc-v/Frontend/lexer.py
diff options
context:
space:
mode:
Diffstat (limited to 'pysc-v/Frontend/lexer.py')
-rw-r--r--pysc-v/Frontend/lexer.py269
1 files changed, 0 insertions, 269 deletions
diff --git a/pysc-v/Frontend/lexer.py b/pysc-v/Frontend/lexer.py
deleted file mode 100644
index c4a9a7d..0000000
--- a/pysc-v/Frontend/lexer.py
+++ /dev/null
@@ -1,269 +0,0 @@
-# TODO Logging is interesting for debugging purposes, decide what to do with it
-from logger import newlogger
-logger = newlogger(__name__)
-#import logging
-#logger.setLevel(logging.WARN)
-
-
-######
-
-from enum import Enum
-from reader import Reader
-
-binChars = set("01")
-octChars = set("01234567")
-decChars = set("0123456789")
-hexChars = set("0123456789ABCDEFabcdef")
-
-class TokenType(Enum):
- """
- These are the possible tokens that the lexer knows, they are converted to
- their most accurate representation in python.
- """
- identifier = 0 # Represented as strings
- label = 1 # Represented as strings or integer if they are numeric
- instruction = 2 # Represented as strings
- directive = 3 # Represented as strings
- integer = 4 # Represented as int
- character = 5 # Represented as strings of length 1
- string = 6 # Represented as strings
- float = 7 # Represented as ??
- # (we need correct conversions to the binary value)
- # TODO
- end = 8
- argsep = 9
- openparens = 10
- closeparens = 11
-
-class Lexer:
- def __init__(self, reader):
- self.reader = reader
- self.tokenstart = 0
-
- def __iter__(self):
- return self
-
- def __next__(self):
- if self.reader.char is None:
- self.reader.advance()
- while True:
- try:
- # Instruction end
- if self.reader.char == "\n" or self.reader.char == ";":
- self.reader.advance()
- return (TokenType.end, None)
-
- # Argument separator
- elif self.reader.char == ",":
- self.reader.advance()
- return (TokenType.argsep, None)
-
- # String
- elif self.reader.char == '"':
- return self.string()
-
- # Character
- elif self.reader.char == "'":
- return self.character()
-
- # Comment
- elif self.reader.char == "#":
- self.comment()
-
- # Parenthesis
- # Load a register as an address + an offset
- elif self.reader.char == '(':
- self.reader.advance()
- return (TokenType.openparens, None)
- elif self.reader.char == ')':
- self.reader.advance()
- return (TokenType.closeparens, None)
-
- # Starts with digit:
- # - Numbers (any kind)
- # - Numeric Labels
- # - Numeric Label references
- # - Load register as address: ld a1, 4(a0)
- # ^^^^^
- # offset + reg
- elif self.reader.char.isdigit() or self.reader.char == "-":
- return self.number()
-
- # Identifiers
- elif self.reader.char.isalpha():
- return self.identifier()
-
- elif self.reader.char == "":
- break # FILE END
-
- else:
- # TODO: Remove this, it's just for testing
- self.reader.advance()
- except Exception as e:
- # Handle exceptions
- # raise StopIteration
- raise e
- raise StopIteration
-
- def string (self):
- logger.info("Found string")
- self.reader.advance() # Ignore opening quotes
-
- string = ""
- escaped = False # Set if previous character was a backslash
-
- while self.reader.char != '"' or escaped:
- if self.reader.char == "":
- # TODO: Check how to do this
- raise ValueError("Error: string not closed, found EOF")
- if self.reader.char == "\n":
- # TODO: Consider the string as closed and continue but report
- # the error?
- raise ValueError("Error: string not closed, found newline")
-
- if escaped:
- string += self.escaped_char(self.reader.char)
- logger.debug("Escape sequence processed %s",
- string[-1].__repr__())
- escaped = False
- continue
- if self.reader.char == "\\":
- escaped = True
- self.reader.advance()
- continue
- escaped = False
- string += self.reader.char
- self.reader.advance()
- logger.info("Lexed string %s", string.__repr__())
- self.reader.advance() # Discard closing "
- return (TokenType.string, string)
-
- def character (self):
- logger.info("Found character")
- self.reader.advance() # Ignore the opening quote
- character = self.reader.char
- if character == "":
- # TODO: Check how to do this
- raise ValueError("Error: found EOF")
- if not character.isprintable() :
- raise ValueError("Error: Non printable character")
-
- if character == "\\":
- self.reader.advance()
- character = self.escaped_char(self.reader.char)
- logger.debug("Escape sequence processed: %s", character.__repr__())
-
- # Make sure it's correctly closed
- self.reader.advance()
- if self.reader.char != "'":
- raise ValueError("Parse error: expected `'`, found " + self.reader.char)
- self.reader.advance() # Discard closing '
-
- logger.info("Lexed char %s", character.__repr__())
- return (TokenType.character, character)
-
- def comment(self):
- while self.reader != "\n":
- self.reader.advance()
-
- def escaped_char(self, ch):
- if ch == '"':
- return '"'
- elif ch == 'n':
- return '\n'
- elif ch == 't':
- return '\t'
- elif ch == '\\':
- return '\\'
- else:
- # TODO: implement more escape sequences
- return ""
-
-
- def number(self):
- """
- Process anything that starts with a number. Could be:
- - An actual number in hex, octal, binary or decimal
- - Numeric labels or numeric label references
- - floating point (not implemented yet)
- """
- numbstr = self.reader.char
- self.reader.advance()
-
- logger.info("Found number: %s", numbstr)
-
- # Hex, Bin, Oct and the 0
- if numbstr == "0":
- reprid = self.reader.peek()
- if reprid == "x":
- self.reader.advance()
- return self.hex()
- elif reprid == "b":
- self.reader.advance()
- return self.bin()
- elif reprid.isdigit():
- return self.oct()
- else:
- return (TokenType.integer, 0)
-
- # TODO: floating point numbers
-
- # Decimal number
- # consume a decimal number
- while self.reader.char in decChars:
- numbstr += self.reader.char
- self.reader.advance()
-
- # Numeric label reference
- if self.reader.char in set("bf"):
- numbstr += self.reader.char
- self.reader.advance()
- return (TokenType.identifier, numbstr)
- # TODO: Return identifier or label?
- # It's a label-ref not a label!
-
- # Numeric label
- if self.reader.char in ":":
- self.reader.advance()
- return (TokenType.label, numbstr)
-
- return (TokenType.integer, int(numbstr))
-
- def hex(self):
- # TODO: Handle possible errors
- numbstr = ""
- while self.reader.char in hexChars:
- numbstr += self.reader.char
- self.reader.advance()
- return (TokenType.integer, int(numbstr, 16))
-
- def oct(self):
- # TODO: Handle possible errors
- numbstr = ""
- while self.reader.char in octChars:
- numbstr += self.reader.char
- self.reader.advance()
- return (TokenType.integer, int(numbstr, 8))
-
- def bin(self):
- # TODO: Handle possible errors
- numbstr = ""
- while self.reader.char in binChars:
- numbstr += self.reader.char
- self.reader.advance()
- return (TokenType.integer, int(numbstr, 2))
-
-
- def identifier(self):
- s = ""
- while self.reader.char.isalnum() or self.reader.char == "_":
- s += self.reader.char
- self.reader.advance()
- return (TokenType.identifier, s)
-
-if __name__ == "__main__":
- import sys
- with Reader(sys.argv[1]) as src:
- lexer = Lexer(src)
- for token in lexer:
- print(token)