手写编译器--破产版词法分析器

时间 2019-11-07

原文链接

忽然感觉到了编译原理的重要性，并且也是算法的落地方向之一，好记性不如烂代码，今天我就穿起万里长征的第一只鞋，先手写个破产版的玩玩。
先来一个小目标，至少要认出我写的东西吧......
词法分析器通常好像都是基于有限状态机的，此处的“好像”体现了说明文的语言准确性。反正无非就是读字符串，看看当前的字符组合是否满足某个状态：满足就保持，不满足就切换，切换的时候把当前字符组合所形成的状态保存为一个 Token。其实蛮简单的，整一台先：

// DfaState identifies a state of the lexer's finite automaton.
type DfaState int

// Lexer states. Numbering starts at 1, so the zero value of DfaState does
// not correspond to any state.
const (
	Initial    DfaState = iota + 1 // start state
	Identifier                     // identifier
	IntLiteral                     // integer literal, e.g. 123
	SemiColon                      // semicolon ;
	LeftParen                      // left parenthesis (
	RightParen                     // right parenthesis )
	Assignment                     // assignment sign =
	Plus                           // plus sign +
	Minus                          // minus sign -
	Star                           // multiplication sign *
	Slash                          // division sign /

	// The keyword is tracked with dedicated states, one per letter, which
	// makes it easy to tell apart from an ordinary identifier.
	Int
	Id_Int1 // the 'i' of the keyword int
	Id_Int2 // the 'n' of the keyword int
	Id_Int3 // the 't' of the keyword int
)
复制代码

下边就是核心的东西了，目前只能识别诸如 int age = 45 这种的，以后会逐渐完善，至少要有个计算器功能嘛。代码里见，轻喷哦。

// Lexer is the interface returned by New for tokenizing source text.
type Lexer interface {
	// Tokenize scans code and records the tokens it recognizes.
	Tokenize(code string)
	// Print writes the recorded tokens to standard output, one per line.
	Print()
}

// tokenS is a single lexed token: its type together with the matched text.
type tokenS struct {
	tokenType token.TokenType // effectively the state the token was in; provisionally a string type (per the original author's note)
	text      string          // bytes accumulated for this token
}

// lexer is the Lexer implementation returned by New.
type lexer struct {
	tmpToken tokenS   // token currently being accumulated
	tokens   []tokenS // tokens completed so far, in input order
}

// New returns a Lexer backed by a zero-valued lexer with no tokens recorded.
func New() Lexer {
	lx := new(lexer)
	return lx
}
// Print writes every recorded token to standard output, one per line, as the
// token text, a single space, and the token type.
func (l *lexer) Print() {
	for i := range l.tokens {
		tok := l.tokens[i]
		line := tok.text + " " + string(tok.tokenType)
		fmt.Println(line)
	}
}
// Tokenize scans code byte by byte with a small finite-state machine and
// appends each completed token to l.tokens. Tokens from earlier calls are
// kept. A byte that initToken does not recognize only terminates the token
// in progress and is otherwise dropped.
func (l *lexer) Tokenize(code string) {
	state := Initial
	for i := 0; i < len(code); i++ {
		ch := code[i]
		switch state {
		case Identifier:
			if isAlpha(ch) || isNumber(ch) {
				l.tmpToken.text += string(ch)
			} else {
				state = l.initToken(ch)
			}
		case IntLiteral:
			if isNumber(ch) {
				l.tmpToken.text += string(ch)
			} else {
				state = l.initToken(ch)
			}
		case Id_Int1:
			switch {
			case ch == 'n': // still a prefix of "int"
				l.tmpToken.text += string(ch)
				state = Id_Int2
			case isAlpha(ch) || isNumber(ch): // not "int", but still an identifier
				l.tmpToken.text += string(ch)
				state = Identifier
			default:
				state = l.initToken(ch)
			}
		case Id_Int2:
			switch {
			case ch == 't': // still a prefix of "int"
				l.tmpToken.text += string(ch)
				state = Id_Int3
			case isAlpha(ch) || isNumber(ch): // not "int", but still an identifier
				l.tmpToken.text += string(ch)
				state = Identifier
			default:
				state = l.initToken(ch)
			}
		case Id_Int3:
			// "int" is the keyword only when the next byte cannot extend
			// it into a longer identifier. Testing for identifier bytes
			// (rather than for blanks) also handles input such as "int=".
			if isAlpha(ch) || isNumber(ch) {
				l.tmpToken.text += string(ch)
				state = Identifier
			} else {
				l.tmpToken.tokenType = token.Int
				state = l.initToken(ch)
			}
		default:
			// Initial and every single-character token state (Assignment,
			// Plus, Minus, Star, Slash, LeftParen, RightParen, ...): the
			// current byte always starts a new token. Using default means
			// no state can be left without a case and stall the scan.
			state = l.initToken(ch)
		}
	}

	// Input that ends right after "int" is still the keyword.
	if state == Id_Int3 {
		l.tmpToken.tokenType = token.Int
	}
	// Flush the last token directly. Going through initToken here would
	// start a new pending token from the final byte, and that stale token
	// would be emitted by the next Tokenize call.
	if len(l.tmpToken.text) > 0 {
		l.tokens = append(l.tokens, l.tmpToken)
		l.tmpToken = tokenS{}
	}
}

// initToken is called on every state change. It closes the pending token, if
// it has any text, by appending it to l.tokens, and then starts a new pending
// token from ch. It returns the state that matches ch, or Initial when ch
// does not start any known token (in which case no new token is started).
func (l *lexer) initToken(ch byte) DfaState {
	if l.tmpToken.text != "" {
		l.tokens = append(l.tokens, l.tmpToken)
		l.tmpToken = tokenS{}
	}

	var next DfaState
	switch {
	case isAlpha(ch):
		// A leading 'i' may turn out to be the keyword "int".
		next = Identifier
		if ch == 'i' {
			next = Id_Int1
		}
		l.tmpToken.tokenType = token.Identifier
	case isNumber(ch):
		next = IntLiteral
		l.tmpToken.tokenType = token.IntLiteral
	case ch == '+':
		next = Plus
		l.tmpToken.tokenType = token.Plus
	case ch == '-':
		next = Minus
		l.tmpToken.tokenType = token.Minus
	case ch == '*':
		next = Star
		l.tmpToken.tokenType = token.Star
	case ch == '/':
		next = Slash
		l.tmpToken.tokenType = token.Slash
	case ch == '(':
		next = LeftParen
		l.tmpToken.tokenType = token.LeftParen
	case ch == ')':
		next = RightParen
		l.tmpToken.tokenType = token.RightParen
	case ch == '=':
		next = Assignment
		l.tmpToken.tokenType = token.Assignment
	default:
		return Initial
	}

	// The pending text is always empty at this point (it either was empty
	// already or has just been reset), so assigning equals appending.
	l.tmpToken.text = string(ch)
	return next
}
// isAlpha reports whether ch is an ASCII letter (a-z or A-Z).
func isAlpha(ch byte) bool {
	switch {
	case 'a' <= ch && ch <= 'z':
		return true
	case 'A' <= ch && ch <= 'Z':
		return true
	}
	return false
}
// isNumber reports whether ch is an ASCII decimal digit (0-9).
func isNumber(ch byte) bool {
	if ch < '0' {
		return false
	}
	return ch <= '9'
}
// isBlank reports whether ch is a space, a horizontal tab, or a newline.
func isBlank(ch byte) bool {
	// The space must be compared with ==. The earlier `ch >= ' '` matched
	// every byte from 0x20 upwards, i.e. letters, digits and operators too.
	return ch == ' ' || ch == '\t' || ch == '\n'
}

复制代码

测试一下哈哈

// main tokenizes a sample declaration and prints the resulting tokens.
func main() {
	const sample = "int age = 45"

	lex := lexer.New()
	lex.Tokenize(sample)
	lex.Print()
}
复制代码

课程周期半年，争取一步步完成一个解释器、一个编译器。增删改查虽然解饿，可是真的很难吃啊。

算法梦想家，来跟我一起玩算法，玩音乐，聊聊文学创作，我们一起天马行空！