Writing a Database System by Hand: Implementing a Small SQL Interpreter (Part 1)

A database system has one core component: the SQL interpreter. Anyone who has used MySQL knows that we drive the database by writing SQL statements, so the system must contain an interpreter that reads the SQL code and, based on its intent, drives the database to perform the corresponding operations. In this installment we build a simple SQL interpreter.

The interpreter is grounded in compiler theory; I have dedicated videos on Bilibili explaining the relevant algorithms, so I won't repeat them here. The first step in building an interpreter is the lexer. In my Bilibili compiler series I implemented a small compiler (dragon-compiler), so I take its lexer and adapt it slightly so that it can tokenize SQL. First copy its lexer package into the current project and open token.go: we modify the token definitions, adding the SQL keywords and removing the definitions that have nothing to do with SQL. The modified code is as follows:

package lexer

type Tag uint32

const (
    //AND corresponds to an SQL keyword
    AND Tag = iota + 256
    //BREAK
    //DO
    EQ
    FALSE
    GE
    ID
    //IF
    //ELSE
    LE
    FLOAT
    MINUS
    PLUS
    NE
    NUM
    OR
    REAL
    TRUE
    //WHILE
    LEFT_BRACE    // "{"
    RIGHT_BRACE   // "}"
    LEFT_BRACKET  //"("
    RIGHT_BRACKET //")"
    AND_OPERATOR
    OR_OPERATOR
    ASSIGN_OPERATOR
    NEGATE_OPERATOR
    LESS_OPERATOR
    GREATER_OPERATOR
    BASIC //corresponds to type definitions such as int, float, bool, char
    //TEMP  //temporary register variable used by the intermediate code
    //SEMICOLON

    //newly added SQL keywords
    SELECT
    FROM
    WHERE
    INSERT
    INTO
    VALUES
    DELETE
    UPDATE
    SET
    CREATE
    TABLE
    INT
    VARCHAR
    VIEW
    AS
    INDEX
    ON
    COMMA
    STRING
    //end of SQL keyword definitions
    EOF

    ERROR
)

var token_map = make(map[Tag]string)

func init() {
    //initialize the strings corresponding to the SQL keywords
    token_map[AND] = "AND"
    token_map[SELECT] = "SELECT"
    token_map[FROM] = "FROM"
    token_map[WHERE] = "WHERE"
    token_map[INSERT] = "INSERT"
    token_map[INTO] = "INTO"
    token_map[VALUES] = "VALUES"
    token_map[DELETE] = "DELETE"
    token_map[UPDATE] = "UPDATE"
    token_map[SET] = "SET"
    token_map[CREATE] = "CREATE"
    token_map[TABLE] = "TABLE"
    token_map[INT] = "INT"
    token_map[VARCHAR] = "VARCHAR"
    token_map[VIEW] = "VIEW"
    token_map[AS] = "AS"
    token_map[INDEX] = "INDEX"
    token_map[ON] = "ON"
    token_map[COMMA] = ","
    token_map[BASIC] = "BASIC"
    //token_map[DO] = "do"
    //token_map[ELSE] = "else"
    token_map[EQ] = "EQ"
    token_map[FALSE] = "FALSE"
    token_map[GE] = "GE"
    token_map[ID] = "ID"
    //token_map[IF] = "if"
    token_map[FLOAT] = "float"

    token_map[LE] = "<="
    token_map[MINUS] = "-"
    token_map[PLUS] = "+"
    token_map[NE] = "!="
    token_map[NUM] = "NUM"
    token_map[OR] = "OR"
    token_map[REAL] = "REAL"
    //token_map[TEMP] = "t"
    token_map[TRUE] = "TRUE"
    //token_map[WHILE] = "while"
    //token_map[DO] = "do"
    //token_map[BREAK] = "break"
    token_map[AND_OPERATOR] = "&"
    token_map[OR_OPERATOR] = "|"
    token_map[ASSIGN_OPERATOR] = "="
    token_map[NEGATE_OPERATOR] = "!"
    token_map[LESS_OPERATOR] = "<"
    token_map[GREATER_OPERATOR] = ">"
    token_map[LEFT_BRACE] = "{"
    token_map[RIGHT_BRACE] = "}"
    token_map[LEFT_BRACKET] = "("
    token_map[RIGHT_BRACKET] = ")"
    token_map[EOF] = "EOF"
    token_map[ERROR] = "ERROR"
    //token_map[SEMICOLON] = ";"

}

type Token struct {
    lexeme string
    Tag    Tag
}

func (t *Token) ToString() string {
    if t.lexeme == "" {
        return token_map[t.Tag]
    }

    return t.lexeme
}

func NewToken(tag Tag) Token {
    return Token{
        lexeme: "",
        Tag:    tag,
    }
}

func NewTokenWithString(tag Tag, lexeme string) *Token {
    return &Token{
        lexeme: lexeme,
        Tag:    tag,
    }
}

In the change above we removed the original C-language keywords and added a series of keywords for SQL. Next, open word_token.go and make the following changes:

package lexer

type Word struct {
    lexeme string
    Tag    Token
}

func NewWordToken(s string, tag Tag) Word {
    return Word{
        lexeme: s,
        Tag:    NewToken(tag),
    }
}

func (w *Word) ToString() string {
    return w.lexeme
}

func GetKeyWords() []Word {
    key_words := []Word{}
    key_words = append(key_words, NewWordToken("||", OR))
    key_words = append(key_words, NewWordToken("==", EQ))
    key_words = append(key_words, NewWordToken("!=", NE))
    key_words = append(key_words, NewWordToken("<=", LE))
    key_words = append(key_words, NewWordToken(">=", GE))
    //add the keywords of the SQL language
    key_words = append(key_words, NewWordToken("AND", AND))
    key_words = append(key_words, NewWordToken("SELECT", SELECT))
    key_words = append(key_words, NewWordToken("FROM", FROM))
    key_words = append(key_words, NewWordToken("WHERE", WHERE))
    key_words = append(key_words, NewWordToken("INSERT", INSERT))
    key_words = append(key_words, NewWordToken("INTO", INTO))
    key_words = append(key_words, NewWordToken("VALUES", VALUES))
    key_words = append(key_words, NewWordToken("DELETE", DELETE))
    key_words = append(key_words, NewWordToken("UPDATE", UPDATE))
    key_words = append(key_words, NewWordToken("SET", SET))
    key_words = append(key_words, NewWordToken("CREATE", CREATE))
    key_words = append(key_words, NewWordToken("TABLE", TABLE))
    key_words = append(key_words, NewWordToken("INT", INT))
    key_words = append(key_words, NewWordToken("VARCHAR", VARCHAR))
    key_words = append(key_words, NewWordToken("VIEW", VIEW))
    key_words = append(key_words, NewWordToken("AS", AS))
    key_words = append(key_words, NewWordToken("INDEX", INDEX))
    key_words = append(key_words, NewWordToken("ON", ON))

    //key_words = append(key_words, NewWordToken("minus", MINUS))
    //key_words = append(key_words, NewWordToken("true", TRUE))
    //key_words = append(key_words, NewWordToken("false", FALSE))
    //key_words = append(key_words, NewWordToken("if", IF))
    //key_words = append(key_words, NewWordToken("else", ELSE))
    //add the while and do keywords
    //key_words = append(key_words, NewWordToken("while", WHILE))
    //key_words = append(key_words, NewWordToken("do", DO))
    //key_words = append(key_words, NewWordToken("break", BREAK))
    //add type definitions
    //key_words = append(key_words, NewWordToken("int", BASIC))
    //key_words = append(key_words, NewWordToken("float", BASIC))
    //key_words = append(key_words, NewWordToken("bool", BASIC))
    //key_words = append(key_words, NewWordToken("char", BASIC))

    return key_words
}

Here, too, the change removes the original C keywords and adds the SQL keyword definitions. Apart from that, the basic logic of the lexer is unchanged; its code is as follows (lexer.go):

package lexer

import (
    "bufio"
    "strconv"
    "strings"
    "unicode"
)

type Lexer struct {
    Lexeme       string
    lexemeStack  []string
    tokenStack   []Token
    peek         byte
    Line         uint32
    reader       *bufio.Reader
    read_pointer int
    key_words    map[string]Token
}

func NewLexer(source string) Lexer {
    str := strings.NewReader(source)
    source_reader := bufio.NewReaderSize(str, len(source))
    lexer := Lexer{
        Line:      uint32(1),
        reader:    source_reader,
        key_words: make(map[string]Token),
    }

    lexer.reserve()

    return lexer
}

func (l *Lexer) ReverseScan() {
    /*
        back_len := len(l.Lexeme)
        bufio can only unread a single byte, so this does not work
        for i := 0; i < back_len; i++ {
            l.reader.UnreadByte()
        }
    */
    if l.read_pointer > 0 {
        l.read_pointer = l.read_pointer - 1
    }

}

func (l *Lexer) reserve() {
    key_words := GetKeyWords()
    for _, key_word := range key_words {
        l.key_words[key_word.ToString()] = key_word.Tag
    }
}

func (l *Lexer) Readch() error {
    char, err := l.reader.ReadByte() //read the next character ahead of time
    l.peek = char
    return err
}

func (l *Lexer) ReadCharacter(c byte) (bool, error) {

    chars, err := l.reader.Peek(1)
    if err != nil {
        return false, err
    }

    peekChar := chars[0]
    if peekChar != c {
        return false, nil
    }

    l.Readch() //skip over the character we just peeked at
    return true, nil
}

func (l *Lexer) UnRead() error {
    return l.reader.UnreadByte()
}

func (l *Lexer) Scan() (Token, error) {

    if l.read_pointer < len(l.lexemeStack) {
        l.Lexeme = l.lexemeStack[l.read_pointer]
        token := l.tokenStack[l.read_pointer]
        l.read_pointer = l.read_pointer + 1
        return token, nil
    } else {
        l.read_pointer = l.read_pointer + 1
    }

    for {
        err := l.Readch()
        if err != nil {
            return NewToken(ERROR), err
        }

        if l.peek == ' ' || l.peek == '\t' {
            continue
        } else if l.peek == '\n' {
            l.Line = l.Line + 1
        } else {
            break
        }
    }

    l.Lexeme = ""

    switch l.peek {
    case ',':
        l.Lexeme = ","
        l.lexemeStack = append(l.lexemeStack, l.Lexeme)
        token := NewToken(COMMA)
        l.tokenStack = append(l.tokenStack, token)
        return token, nil
    case '{':
        l.Lexeme = "{"
        l.lexemeStack = append(l.lexemeStack, l.Lexeme)
        token := NewToken(LEFT_BRACE)
        l.tokenStack = append(l.tokenStack, token)
        return token, nil
    case '}':
        l.Lexeme = "}"
        l.lexemeStack = append(l.lexemeStack, l.Lexeme)
        token := NewToken(RIGHT_BRACE)
        l.tokenStack = append(l.tokenStack, token)
        return token, nil
    case '+':
        l.Lexeme = "+"
        l.lexemeStack = append(l.lexemeStack, l.Lexeme)
        token := NewToken(PLUS)
        l.tokenStack = append(l.tokenStack, token)
        return token, nil
    case '-':
        l.Lexeme = "-"
        l.lexemeStack = append(l.lexemeStack, l.Lexeme)
        token := NewToken(MINUS)
        l.tokenStack = append(l.tokenStack, token)
        return token, nil
    case '(':
        l.Lexeme = "("
        l.lexemeStack = append(l.lexemeStack, l.Lexeme)
        token := NewToken(LEFT_BRACKET)
        l.tokenStack = append(l.tokenStack, token)
        return token, nil
    case ')':
        l.Lexeme = ")"
        l.lexemeStack = append(l.lexemeStack, l.Lexeme)
        token := NewToken(RIGHT_BRACKET)
        l.tokenStack = append(l.tokenStack, token)
        return token, nil
    case '&':
        l.Lexeme = "&"
        if ok, err := l.ReadCharacter('&'); ok {
            l.Lexeme = "&&"
            word := NewWordToken("&&", AND)
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            l.tokenStack = append(l.tokenStack, word.Tag)
            return word.Tag, err
        } else {
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            token := NewToken(AND_OPERATOR)
            l.tokenStack = append(l.tokenStack, token)
            return token, err
        }
    case '|':
        l.Lexeme = "|"
        if ok, err := l.ReadCharacter('|'); ok {
            l.Lexeme = "||"
            word := NewWordToken("||", OR)
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            l.tokenStack = append(l.tokenStack, word.Tag)
            return word.Tag, err
        } else {
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            token := NewToken(OR_OPERATOR)
            l.tokenStack = append(l.tokenStack, token)
            return token, err
        }

    case '=':
        l.Lexeme = "="
        if ok, err := l.ReadCharacter('='); ok {
            l.Lexeme = "=="
            word := NewWordToken("==", EQ)
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            l.tokenStack = append(l.tokenStack, word.Tag)
            return word.Tag, err
        } else {
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            token := NewToken(ASSIGN_OPERATOR)
            l.tokenStack = append(l.tokenStack, token)
            return token, err
        }

    case '!':
        l.Lexeme = "!"
        if ok, err := l.ReadCharacter('='); ok {
            l.Lexeme = "!="
            word := NewWordToken("!=", NE)
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            l.tokenStack = append(l.tokenStack, word.Tag)
            return word.Tag, err
        } else {
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            token := NewToken(NEGATE_OPERATOR)
            l.tokenStack = append(l.tokenStack, token)
            return token, err
        }

    case '<':
        l.Lexeme = "<"
        if ok, err := l.ReadCharacter('='); ok {
            l.Lexeme = "<="
            word := NewWordToken("<=", LE)
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            l.tokenStack = append(l.tokenStack, word.Tag)
            return word.Tag, err
        } else {
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            token := NewToken(LESS_OPERATOR)
            l.tokenStack = append(l.tokenStack, token)
            return token, err
        }

    case '>':
        l.Lexeme = ">"
        if ok, err := l.ReadCharacter('='); ok {
            l.Lexeme = ">="
            word := NewWordToken(">=", GE)
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            l.tokenStack = append(l.tokenStack, word.Tag)
            return word.Tag, err
        } else {
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            token := NewToken(GREATER_OPERATOR)
            l.tokenStack = append(l.tokenStack, token)
            return token, err
        }

    case '"':
        for {
            err := l.Readch()
            if l.peek == '"' {
                l.lexemeStack = append(l.lexemeStack, l.Lexeme)
                token := NewToken(STRING)
                l.tokenStack = append(l.tokenStack, token)
                return token, nil
            }

            if err != nil {
                panic("string no end with quota")
            }
            l.Lexeme += string(l.peek)
        }

    }

    if unicode.IsNumber(rune(l.peek)) {
        var v int
        var err error
        for {
            num, err := strconv.Atoi(string(l.peek))
            if err != nil {
                if l.peek != 0 { //l.peek == 0 means all characters have been consumed
                    l.UnRead() //put the character back for the next scan
                }

                break
            }
            v = 10*v + num
            l.Lexeme += string(l.peek)
            l.Readch()
        }

        if l.peek != '.' {
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            token := NewToken(NUM)
            token.lexeme = l.Lexeme
            l.tokenStack = append(l.tokenStack, token)
            return token, err
        }
        l.Lexeme += string(l.peek)
        l.Readch() //skip over the "."

        x := float64(v)
        d := float64(10)
        for {
            l.Readch()
            num, err := strconv.Atoi(string(l.peek))
            if err != nil {
                if l.peek != 0 { //l.peek == 0 means all characters have been consumed
                    l.UnRead() //put the character back for the next scan
                }

                break
            }

            x = x + float64(num)/d
            d = d * 10
            l.Lexeme += string(l.peek)
        }
        l.lexemeStack = append(l.lexemeStack, l.Lexeme)
        token := NewToken(REAL)
        token.lexeme = l.Lexeme
        l.tokenStack = append(l.tokenStack, token)
        return token, err
    }

    if unicode.IsLetter(rune(l.peek)) {
        var buffer []byte
        for {
            buffer = append(buffer, l.peek)
            l.Lexeme += string(l.peek)

            l.Readch()
            if !unicode.IsLetter(rune(l.peek)) {
                if l.peek != 0 { //l.peek == 0 means all characters have been consumed
                    l.UnRead() //put the character back for the next scan
                }
                break
            }
        }

        s := string(buffer)
        //the keywords are registered in upper case and SQL keywords are case-insensitive,
        //so normalize the identifier before the keyword lookup
        token, ok := l.key_words[strings.ToUpper(s)]
        if ok {
            l.lexemeStack = append(l.lexemeStack, l.Lexeme)
            l.tokenStack = append(l.tokenStack, token)
            return token, nil
        }

        l.lexemeStack = append(l.lexemeStack, l.Lexeme)
        token = NewToken(ID)
        token.lexeme = l.Lexeme
        l.tokenStack = append(l.tokenStack, token)
        return token, nil
    }

    return NewToken(EOF), nil
}

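One detail worth pointing out: Scan caches every token it produces in lexemeStack and tokenStack, and ReverseScan only decrements read_pointer, so the next Scan call replays the cached token instead of re-reading the input. The parser below leans on this one-token backtracking. Here is a small illustrative snippet of my own (it assumes the same GOPATH-style "lexer" import used by the test code below):

package main

import (
    "fmt"
    "lexer"
)

func main() {
    l := lexer.NewLexer("from student")
    first, _ := l.Scan()  //scans the first token and caches it
    l.ReverseScan()       //step back one token
    second, _ := l.Scan() //replays the cached token instead of re-reading the input
    fmt.Println(first.Tag == second.Tag, l.Lexeme) //prints: true from
}
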
To save space I have not pasted every change to every file; search for "coding 迪斯尼" on Bilibili for the full details. Now let's exercise the code above by adding the following test code to main.go:

package main

import (
    "fmt"
    "lexer"
)

func main() {
    sqlLexer := lexer.NewLexer("select name , sex from student where age > 20")
    var tokens []*lexer.Token
    tokens = append(tokens, lexer.NewTokenWithString(lexer.SELECT, "select"))
    tokens = append(tokens, lexer.NewTokenWithString(lexer.ID, "name"))
    tokens = append(tokens, lexer.NewTokenWithString(lexer.COMMA, ","))
    tokens = append(tokens, lexer.NewTokenWithString(lexer.ID, "sex"))
    tokens = append(tokens, lexer.NewTokenWithString(lexer.FROM, "from"))
    tokens = append(tokens, lexer.NewTokenWithString(lexer.ID, "student"))
    tokens = append(tokens, lexer.NewTokenWithString(lexer.WHERE, "where"))
    tokens = append(tokens, lexer.NewTokenWithString(lexer.ID, "age"))
    tokens = append(tokens, lexer.NewTokenWithString(lexer.GREATER_OPERATOR, ">"))
    tokens = append(tokens, lexer.NewTokenWithString(lexer.NUM, "20"))

    for _, tok := range tokens {
        sqlTok, err := sqlLexer.Scan()
        if err != nil {
            fmt.Println("lexer error: ", err)
            return
        }

        if sqlTok.Tag != tok.Tag {
            fmt.Printf("token err, expect: %v, but got %v\n", tok, sqlTok)
            return
        }
    }

    fmt.Println("lexer testing pass...")
}

Running this, the final line "lexer testing pass..." is printed, so the basic logic of the lexer is correct. Next we turn to syntax parsing. For reasons of space we only handle a small subset of SQL here; interested readers can extend the interpreter on their own. First, let's define the part of the SQL grammar we are going to parse:

FIELD -> ID
CONSTANT -> STRING | NUM
EXPRESSION -> FIELD | CONSTANT
TERM -> EXPRESSION EQ EXPRESSION
PREDICATE -> TERM (AND PREDICATE)?

QUERY -> SELECT SELECT_LIST FROM TABLE_LIST (WHERE PREDICATE)?
SELECT_LIST -> FIELD (COMMA SELECT_LIST)?
TABLE_LIST -> ID (COMMA TABLE_LIST)?

UPDATE_COMMAND -> INSERT_COMMAND | DELETE_COMMAND | MODIFY_COMMAND | CREATE_COMMAND
CREATE_COMMAND -> CREATE_TABLE | CREATE_VIEW | CREATE_INDEX
INSERT_COMMAND -> INSERT INTO ID LEFT_BRACE FIELD_LIST RIGHT_BRACE VALUES CONSTANT_LIST
FIELD_LIST -> FIELD (COMMA FIELD_LIST)?
CONSTANT_LIST -> CONSTANT (COMMA CONSTANT_LIST)?

DELETE_COMMAND -> DELETE FROM ID (WHERE PREDICATE)?

MODIFY_COMMAND -> UPDATE ID SET FIELD EQ EXPRESSION (WHERE PREDICATE)?

CREATE_TABLE -> CREATE TABLE FIELD_DEFS
FIELD_DEFS -> FIELD_DEF (COMMA FIELD_DEFS)?
FIELD_DEF -> ID TYPE_DEF
TYPE_DEF -> INT | VARCHAR LEFT_BRACE NUM RIGHT_BRACE

CREATE_VIEW -> CREATE VIEW ID AS QUERY
CREATE_INDEX -> CREATE INDEX ID ON ID LEFT_BRACE FIELD RIGHT_BRACE

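To make the notation concrete, here is one statement worked through these rules (my own example, not from the original; EQ stands for the "=" token):

update student set age = 20 where name = "jim"

MODIFY_COMMAND -> UPDATE ID SET FIELD EQ EXPRESSION WHERE PREDICATE
ID             -> student
FIELD          -> ID -> age
EXPRESSION     -> CONSTANT -> NUM -> 20
PREDICATE      -> TERM -> EXPRESSION EQ EXPRESSION -> name = "jim"
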
Next, let's see how to parse SQL code using the grammar rules above. We use top-down recursive-descent parsing; for the details of the algorithm, see my compiler videos on Bilibili. Create a new folder named parser in the project and add a parser.go file to it. To keep things simple we implement a small piece at a time and then run the finished code to check the result. We start with the TERM rule; the code is as follows:

package parser

import (
    "lexer"
    "query"
    "strconv"
    "strings"
)

type SQLParser struct {
    sqlLexer lexer.Lexer
}

func NewSQLParser(s string) *SQLParser {
    return &SQLParser{
        sqlLexer: lexer.NewLexer(s),
    }
}

func (p *SQLParser) Field() (lexer.Token, string) {
    tok, err := p.sqlLexer.Scan()
    if err != nil {
        panic(err)
    }
    if tok.Tag != lexer.ID {
        panic("Tag of FIELD is not ID")
    }

    return tok, p.sqlLexer.Lexeme
}

func (p *SQLParser) Constant() *query.Constant {
    tok, err := p.sqlLexer.Scan()
    if err != nil {
        panic(err)
    }

    switch tok.Tag {
    case lexer.STRING:
        s := strings.Clone(p.sqlLexer.Lexeme)
        return query.NewConstantWithString(&s)
        break
    case lexer.NUM:
        //be careful about whether stack-allocated values remain valid after the function returns
        v, err := strconv.Atoi(p.sqlLexer.Lexeme)
        if err != nil {
            panic("string is not a number")
        }
        return query.NewConstantWithInt(&v)
        break
    default:
        panic("token is not string or num when parsing constant")
    }

    return nil
}

func (p *SQLParser) Expression() *query.Expression {
    tok, err := p.sqlLexer.Scan()
    if err != nil {
        panic(err)
    }

    if tok.Tag == lexer.ID {
        p.sqlLexer.ReverseScan()
        _, str := p.Field()
        return query.NewExpressionWithString(str)
    } else {
        p.sqlLexer.ReverseScan()
        constant := p.Constant()
        return query.NewExpressionWithConstant(constant)
    }
}

func (p *SQLParser) Term() *query.Term {
    lhs := p.Expression()
    tok, err := p.sqlLexer.Scan()
    if err != nil {
        panic(err)
    }
    if tok.Tag != lexer.ASSIGN_OPERATOR {
        panic("should have = in middle of term")
    }

    rhs := p.Expression()
    return query.NewTerm(lhs, rhs)
}

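The parser imports a query package whose Constant, Expression, Term and Predicate types are not listed in this installment; they belong to the wider project. To give the code above something concrete to compile against, the following is only a minimal sketch of the interface the parser relies on, written under my own assumptions, not the project's actual implementation:

package query

import "strconv"

//Constant wraps either a string literal or an int literal taken from the SQL text
type Constant struct {
    strVal *string
    intVal *int
}

func NewConstantWithString(s *string) *Constant { return &Constant{strVal: s} }

func NewConstantWithInt(v *int) *Constant { return &Constant{intVal: v} }

func (c *Constant) ToString() string {
    if c.strVal != nil {
        return *c.strVal
    }
    return strconv.Itoa(*c.intVal)
}

//Expression is either a field name or a constant
type Expression struct {
    fieldName string
    constant  *Constant
}

func NewExpressionWithString(field string) *Expression { return &Expression{fieldName: field} }

func NewExpressionWithConstant(c *Constant) *Expression { return &Expression{constant: c} }

func (e *Expression) ToString() string {
    if e.constant != nil {
        return e.constant.ToString()
    }
    return e.fieldName
}

//Term is an equality comparison between two expressions
type Term struct {
    lhs *Expression
    rhs *Expression
}

func NewTerm(lhs *Expression, rhs *Expression) *Term { return &Term{lhs: lhs, rhs: rhs} }

func (t *Term) ToString() string { return t.lhs.ToString() + " = " + t.rhs.ToString() }

//Predicate is a list of terms joined by AND
type Predicate struct {
    terms []*Term
}

func NewPredicate() *Predicate { return &Predicate{} }

func NewPredicateWithTerms(t *Term) *Predicate { return &Predicate{terms: []*Term{t}} }

func (p *Predicate) ConjoinWith(other *Predicate) { p.terms = append(p.terms, other.terms...) }

func (p *Predicate) ToString() string {
    result := ""
    for i, t := range p.terms {
        if i > 0 {
            result += " and "
        }
        result += t.ToString()
    }
    return result
}

The only requirements the parser places on these types are the constructors it calls and the ToString methods used later by QueryData.
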
The TERM rule parses expressions such as age = 20 or name = 'jim' that typically appear after where (this simplified grammar only supports the "=" comparison). Let's test the parsing code above by putting the following into main.go:

package main

import (
    "fmt"
    "parser"
)

func main() {
    sqlParser := parser.NewSQLParser("age = 20")
    term := sqlParser.Term()
    s := fmt.Sprintf("term: %v\n", term)
    fmt.Println(s)
}

Please watch the debugging walkthrough of the code above on Bilibili; stepping through it makes the logic much easier to digest. Next we implement parsing for the following rules:
PREDICATE -> TERM (AND PREDICATE)?
QUERY -> SELECT SELECT_LIST FROM TABLE_LIST (WHERE PREDICATE)?
SELECT_LIST -> FIELD (COMMA SELECT_LIST)?
TABLE_LIST -> ID (COMMA TABLE_LIST)?

Note that PREDICATE corresponds to the part after where: in where a > b and c < d, the text "a > b and c < d" is the PREDICATE of the grammar. The code below also calls fmt, so add "fmt" to the import list of parser.go. The corresponding code is as follows:

func (p *SQLParser) Predicate() *query.Predicate {
    //a predicate is the condition part that follows the where keyword, e.g. where a > b and c < d
    //here "a > b and c < d" is the predicate
    pred := query.NewPredicateWithTerms(p.Term())
    tok, err := p.sqlLexer.Scan()
    // if the statement has already been fully consumed, just return
    if err != nil && fmt.Sprint(err) != "EOF" {
        panic(err)
    }

    if tok.Tag == lexer.AND {
        pred.ConjoinWith(p.Predicate())
    } else {
        p.sqlLexer.ReverseScan()
    }

    return pred
}

func (p *SQLParser) Query() *QueryData {
    //Query parses a select statement
    tok, err := p.sqlLexer.Scan()
    if err != nil {
        panic(err)
    }

    if tok.Tag != lexer.SELECT {
        panic("token is not select")
    }

    fields := p.SelectList()
    tok, err = p.sqlLexer.Scan()
    if err != nil {
        panic(err)
    }

    if tok.Tag != lexer.FROM {
        panic("token is not from")
    }

    //get the table names the select statement operates on
    tables := p.TableList()
    //check whether the select statement has a where clause
    tok, err = p.sqlLexer.Scan()
    if err != nil && fmt.Sprint(err) != "EOF" {
        panic(err)
    }

    pred := query.NewPredicate()
    if tok.Tag == lexer.WHERE {
        pred = p.Predicate()
    } else {
        p.sqlLexer.ReverseScan()
    }

    return NewQueryData(fields, tables, pred)
}

func (p *SQLParser) SelectList() []string {
    //SELECT_LIST is the list of column names after the select keyword
    l := make([]string, 0)
    _, field := p.Field()
    l = append(l, field)

    tok, err := p.sqlLexer.Scan()
    if err != nil {
        panic(err)
    }
    if tok.Tag == lexer.COMMA {
        //selecting multiple columns, separated by commas
        selectList := p.SelectList()
        l = append(l, selectList...)
    } else {
        p.sqlLexer.ReverseScan()
    }

    return l
}

func (p *SQLParser) TableList() []string {
    //TABLE_LIST is the list of table names after the from keyword
    l := make([]string, 0)
    tok, err := p.sqlLexer.Scan()
    if err != nil {
        panic(err)
    }
    if tok.Tag != lexer.ID {
        panic("token is not id")
    }

    l = append(l, p.sqlLexer.Lexeme)
    tok, err = p.sqlLexer.Scan()
    //the table list may be the last thing in the statement, so tolerate EOF here
    if err != nil && fmt.Sprint(err) != "EOF" {
        panic(err)
    }
    if tok.Tag == lexer.COMMA {
        tableList := p.TableList()
        l = append(l, tableList...)
    } else {
        p.sqlLexer.ReverseScan()
    }

    return l
}

Create a new Go file named query_data.go; we use the QueryData structure to store the result of parsing a select statement. Its contents are as follows:

package parser

//QueryData describes the information of a select statement
import (
    "query"
)

type QueryData struct {
    fields []string
    tables []string
    pred   *query.Predicate
}

func NewQueryData(fields []string, tables []string, pred *query.Predicate) *QueryData {
    return &QueryData{
        fields: fields,
        tables: tables,
        pred:   pred,
    }
}

func (q *QueryData) Fields() []string {
    return q.fields
}

func (q *QueryData) Tables() []string {
    return q.tables
}

func (q *QueryData) Pred() *query.Predicate {
    return q.pred
}

func (q *QueryData) ToString() string {
    result := "select "
    for _, fldName := range q.fields {
        result += fldName + ", "
    }

    // strip the trailing ", "
    result = result[:len(result)-2]
    result += " from "
    for _, tableName := range q.tables {
        result += tableName + ", "
    }
    // strip the trailing ", "
    result = result[:len(result)-2]
    predStr := q.pred.ToString()
    if predStr != "" {
        result += " where " + predStr
    }

    return result
}

Suppose we have the following SQL statement:

select age, name, sex from student, department where age = 20 and sex = "male"

We can then call the Query method above to drive the parse: the column list after select, i.e. "age, name, sex", is handled by SelectList, the table names after from by TableList, and everything after where by Predicate, which turns age = 20 and sex = "male" into expressions. Add the following code to main.go to exercise it:

package main

import (
    "fmt"
    "parser"
)

func main() {
    sqlParser := parser.NewSQLParser("select age, name, sex from student, department where age = 20 and sex = \"male\" ")
    queryData := sqlParser.Query()
    fmt.Println(queryData.ToString())
}

For the full debugging walkthrough, please see the videos on Bilibili; stepping through the code is the best way to understand the parsing logic. Since this topic is fairly long, we will split it across several installments.
