Lexical analysis using Python

By Andreas Schickedanz May 10, 2013

I was always very facinated of everything about compilers, interpreters, parsers and lexers. Now I was wondering how I could implement such things using Python. I decided to write a simple lexer for algebraic expressions that converts a sequence of characters into a sequence of tokens. Till now I just implemented such things using C or C++ and so I started the simplest way as I learned by reading the Dragon Book.

First of all I created a simple enum type that allows me to create enums as used to from languages like C++ and Java. Then I implemented a Token class that holds all information about a token and enables me to easily print a token using the Python’s magic str method. Finally I added a Lexer class that processes character by character and returns the tokens.

This resulted in the following code:

#!/usr/bin/python
import sys
import os
import re

def enum(*sequential, **named):
    enums = dict(zip(sequential, range(len(sequential))), **named)
    reverse = dict((value, key) for key, value in enums.iteritems())
    enums['key'] = reverse
    return type('Enum', (), enums)

Type = enum(
	'PLUS', 'MINUS', 'TIMES', 'DIV', 'MOD', 'EXP', 'FAC',
	'FUNC', 'VAR', 'NUMBER', 'LBRACE', 'RBRACE'
)

class Token(object):
	def __init__(self, type, alias, value):
		self.type = type
		self.alias = alias
		self.value = value

	def __str__(self):
		return '{2} <{1}>'.format(self.type, self.alias, self.value)

class Lexer(object):
	def __init__(self, stream):
		# Initialize class attributes ...
		self.stream = stream
		self.current = None
		self.offset = -1
		self.matchRes = None

		# ... and get the first character.
		self.getChar()

	def nextToken(self):
		if self.current is None:
			return None

		self.skipWhitespaces()

		if self.current == '+' and self.lookbefore() not in ('+','-','^','/','%'):
			return Token(Type.PLUS, Type.key[Type.PLUS], self.getChar())
		elif self.current == '-' and self.lookbefore() not in ('+','-','^','/','%'):
			return Token(Type.MINUS, Type.key[Type.MINUS], self.getChar())
		elif self.current == '*':
			return Token(Type.TIMES, Type.key[Type.TIMES], self.getChar())
		elif self.current == '/':
			return Token(Type.DIV, Type.key[Type.DIV], self.getChar())
		elif self.current == '%':
			return Token(Type.MOD, Type.key[Type.MOD], self.getChar())
		elif self.current == '^':
			return Token(Type.EXP, Type.key[Type.EXP], self.getChar())
		elif self.current == '!':
			return Token(Type.FAC, Type.key[Type.FAC], self.getChar())
		elif self.current == '(':
			return Token(Type.LBRACE, Type.key[Type.LBRACE], self.getChar())
		elif self.current == ')':
			return Token(Type.RBRACE, Type.key[Type.RBRACE], self.getChar())
		elif self.match('[a-zA-Z_][a-zA-Z0-9_]*(?=([ \t]+)?\()'):
			return Token(Type.FUNC, Type.key[Type.FUNC], self.getMatch())
		elif self.match('[+-]?[a-zA-Z_][a-zA-Z0-9_]*(?!([ \t]+)?\()'):
			return Token(Type.VAR, Type.key[Type.VAR], self.getMatch())
		elif self.match('[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?'):
			return Token(Type.NUMBER, Type.key[Type.NUMBER], self.getMatch())
		else:
			print 'No match found'

	def skipWhitespaces(self):
		while self.current.isspace():
			self.getChar()

	def getChar(self):
		if self.offset + 1 >= len(self.stream):
			return None

		result = self.stream[self.offset]
		self.offset += 1
		self.current = self.stream[self.offset]

		return result

	def getMatch(self):
		if self.matchRes is None:
			return None

		result = self.matchRes.group(0)
		self.offset += len(result)

		if self.offset + 1 >= len(self.stream):
			self.current = None
		else:
			self.current = self.stream[self.offset]

		return result

	def match(self, format):
		# Prepare the given pattern ...
		pattern = re.compile(format)

		# ... and match the pattern against the stream.
		self.matchRes = re.match(pattern, self.stream[self.offset:])

		if self.matchRes is not None:
			return True
		else:
			return False

	def lookahead(self):
		return self.stream[self.offset+1:self.offset+2]

	def lookbefore(self):
		return self.stream[self.offset-1:self.offset]

def main(argv):
	form = argv[0]

	# Do something with this data.
	if form is not None:
		lex = Lexer(form)
		token = lex.nextToken()
		while token is not None:
			print token
			token = lex.nextToken()
	else:
		print 'Invalid parameters.'
		sys.exit(1)

if __name__ == "__main__":
	main(sys.argv[1:])

But this code is very long and it took some time until it worked. Fortunately, I stumbled over Python’s Regex module documentation where I found a simple tokenizer based on regular expressions. So I wrote a new script that uses this technique to parse algebraic expressions. This resulted in a much smaller script:

#!/usr/bin/python
import collections
import sys
import re

def tokenize(stream):
	Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])

	tokenSpec = [
		# Arithmetic operators
		('OPERATOR',	r'(?<!--[\+\-\^\*/%])[\+\-]|[\^\*/%!]'),<br /-->
		# Function identifiers
		('FUNCTIONID',	r'[a-zA-Z_][a-zA-Z0-9_]*(?=([ \t]+)?\()'),
		# Variable identifiers
		('VARIABLEID',	r'[+-]?[a-zA-Z_][a-zA-Z0-9_]*(?!([ \t]+)?\()'),
		# Any numeric value (decimal or floating point)
		('NUMBER',		r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?'),
		# Left brace
		('LBRACE',		r'[(]'),
		# Right brace
		('RBRACE',		r'[)]'),
		# Assignment operator
		('ASSIGN',		r':='),
		# Line endings
		('NEWLINE',		r'\n'),
		# Skip over spaces and tabs
		('SKIP',			r'[ \t]'),
	]

	tokenRegex = '|'.join('(?P<%s>%s)' % pair for pair in tokenSpec)
	keywords = {
		'+': 'PLUS', '-': 'MINUS', '*': 'TIMES', '/': 'DIV',
		'^': 'EXP', '%': 'MOD', '!': 'FAC'
	}

	nextToken = re.compile(tokenRegex).match

	# Setup the line start and current position ...
	pos = lineStart = 0

	# ... as well as the current line.
	line = 1

	# Fetch the first token ...
	token = nextToken(stream)

	# ... and start the processing.
	while token is not None:
		# Fetch the token type ...
		typ = token.lastgroup

		# ... and increment line counter if it is a newline.
		if typ == 'NEWLINE':
			lineStart = pos
			line += 1
		elif typ != 'SKIP':
			# Fetch the token value ...
			value = token.group(typ)

			# ... and handle keywords.
			if typ == 'OPERATOR' and value in keywords.keys():
				typ = keywords[value]

			yield Token(typ, value, line, token.start() - lineStart)

		pos = token.end()
		token = nextToken(stream, pos)
	if pos != len(stream):
		raise TokenizerException('Unexpected character %r on line %d' % (stream[pos], line))

def main(argv):
	form = argv[0]

	# Do something with this data.
	if form is not None:
		for token in tokenize(form):
			print token
	else:
		print 'Invalid parameters.\n'
		sys.exit(1)

if __name__ == "__main__":
	main(sys.argv[1:])

But as life goes, once you have a running solution, a friend tells you that there is a far simpler and more efficient way to implement this. Yeah, you are right, René told me about the pyparsing library. I couldn’t resist and so I implemented the same process using an other approach and all this just for fun. This script was again a little bit shorter than the regular expression based solution and brings another benefit: You are not only able to implement the lexical analysis in just a few lines, you could also do the parsing using this library by adding just a few lines.

#!/usr/bin/env python
from pyparsing import Word, Regex, Literal, OneOrMore, ParseException
import sys
import re

def main(argv):
	form = argv[0]

	# Do something with this data.
	if form is not None:
		try:
			operator = Regex(r'(?<!--[\+\-\^\*/%])[\+\-]|[\^\*/%!]')<br /-->
			function = Regex(r'[a-zA-Z_][a-zA-Z0-9_]*(?=([ \t]+)?\()')
			variable = Regex(r'[+-]?[a-zA-Z_][a-zA-Z0-9_]*(?!([ \t]+)?\()')
			number = Regex(r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?')
			lbrace = Word('(')
			rbrace = Word(')')
			assign = Literal(':=')
			linebreak = Word('\n')
			skip = Word(' \t')

			lexOnly = operator | function | variable | number | lbrace \
				| rbrace | assign | linebreak | skip
			lexAllOnly = OneOrMore(lexOnly)

			print lexAllOnly.parseString(form)
		except ParseException, err:
			print err.line
			print " "*(err.column-1) + "^"
			print err
	else:
		print 'Invalid parameters.\n'
		sys.exit(1)

if __name__ == "__main__":
	main(sys.argv[1:])

For me it was very fascinating to see how many possibilties Python offers to do lexical analysis and how powerful regular expressions are. From one day of fun it became a day of many insights. Maybe I will delving deeper into the topic when I have a bit more time.

I hope you enjoyed this short overview of the different abilities of Python to tokenize a sequence of characters. I am looking forward to see your implementations and read your recommendations. Until next time, happy coding.

Phidelux is a Computer Science MSc. interested in hardware hacking, embedded Linux, compilers, etc.