编程语言
首页 > 编程语言 > 编译原理-词法分析 python实现

编译原理-词法分析 python实现

作者:互联网

上学的时候,老师让写实验。我看到这破旧的vc++6.0,心生厌恶,于是申请用python写。
老师同意了,那么就有了我的代码。
我分享出来,希望供大家参考。

词法分析的python代码参考了C语言版本的代码 : https://www.cnblogs.com/zyrblog/p/6885922.html

# 参考C语言代码 : https://www.cnblogs.com/zyrblog/p/6885922.html
import re
import copy

class Scanner(object):
    """Character classifier and comment filter for a small C-style lexer.

    Token category codes used by this class and by the code that drives it:
    1 = reserved word, 2 = identifier, 3 = unsigned integer,
    4 = operator, 5 = delimiter, 6 = error.
    """

    # Reserved words -- category 1
    reserveWord = [
        "auto", "break", "case", "char", "const", "continue",
        "default", "do", "double", "else", "enum", "extern",
        "float", "for", "goto", "if", "int", "long",
        "register", "return", "short", "signed", "sizeof", "static",
        "struct", "switch", "typedef", "union", "unsigned", "void",
        "volatile", "while"
    ]

    # Identifiers -- category 2
    # Unsigned integers -- category 3

    # Operators -- category 4
    operator = [
        "+", "-", "*", "/", "<", "<=", ">", ">=", "=", "==",
        "!=", "&", "&&", "|", "||", "%", "<<", ">>", "+="
    ]

    # Delimiters -- category 5
    Delimiter = [
        ";", "(", ")", "^", ",", "\"", "\'", "[", "]", "{", "}"
    ]

    # Errors -- category 6

    def searchReserve(self, reserveWord):
        """Return True if *reserveWord* is a reserved word, False otherwise.

        A False result means the word should be treated as an identifier.
        """
        return reserveWord in self.reserveWord

    def IsLetter(self, letter):
        """Return True if *letter* starts with an ASCII letter or underscore."""
        return bool(re.match(r'[a-zA-Z\_]', letter))

    def IsDigit(self, digit):
        """Return True if *digit* starts with a decimal digit."""
        return bool(re.match(r'\d', digit))

    def IsSign(self, Sign):
        """Classify *Sign* as a delimiter or an operator.

        Returns 5 for a delimiter, 4 for an operator and -1 for anything
        else. Delimiters are checked first.
        """
        if Sign in self.Delimiter:
            return 5
        if Sign in self.operator:
            return 4
        return -1

    def filterResource(self, code):
        """Strip comments and control whitespace from source lines.

        Args:
            code: iterable of source lines (strings).

        Returns:
            A new list with the filtered lines; lines that end up empty are
            omitted. ``//`` comments run to the end of the line, ``/* */``
            comments may span several lines. Tabs and vertical tabs are
            replaced by a single space so the tokens they separate stay
            separated; carriage returns and newlines are dropped.
        """
        in_block_comment = False
        filtered = []
        for s_line in code:
            kept = []
            length = len(s_line)
            i = 0
            while i < length:
                pair = s_line[i:i + 2]

                if in_block_comment:
                    if pair == '*/':
                        # Consume exactly the two closing characters so the
                        # character right after the comment is not lost.
                        in_block_comment = False
                        i += 2
                    else:
                        i += 1
                    continue

                if pair == '//':
                    break  # the rest of the line is a comment

                if pair == '/*':
                    # Consume both opening characters so that "/*/" is not
                    # mistaken for an immediately closed comment.
                    in_block_comment = True
                    i += 2
                    continue

                ch = s_line[i]
                if ch == '\t' or ch == '\v':
                    kept.append(' ')
                elif ch != '\n' and ch != '\r':
                    kept.append(ch)
                i += 1

            text = ''.join(kept)
            if text != '':
                filtered.append(text)

        return filtered


def clear_number(number_list):
    """Reset the first two slots of *number_list* to 0, in place."""
    for slot in (0, 1):
        number_list[slot] = 0

def clear_sign(sign_list):
    """Reset *sign_list* in place: slot 0 becomes '' and slot 1 becomes 0."""
    sign_list[0], sign_list[1] = '', 0

def clear_word(word):
    """Return the empty string used to reset a word buffer.

    Strings are immutable and assigning to the parameter only rebinds the
    local name, so the caller's variable cannot be cleared from here.
    Use the return value instead: ``word = clear_word(word)``.
    """
    return ''

# Main program
# Read the input file, one stripped line per list entry.
scanner = Scanner()
code = []
with open('D:/test.txt', 'r',encoding='UTF-8') as f:
    for line in f.readlines():
        code.append(line.strip())

    # Filter the code through the scanner's comment filter.
    code = scanner.filterResource(code)

    # Token recognition.
    # Characters are either signs (operators/delimiters) or non-signs.
    # Non-sign tokens are separated from each other by spaces; a sign next
    # to a non-sign needs no separator. Tokens are therefore cut at spaces
    # and at signs. Each token is printed as ( category ," text ").
    for line in code:
        word = ''              # identifier / reserved word being accumulated
        number_list = [0, 0]   # [value so far, 1 while a number is in progress]
        sign_list = ['', 0]    # [pending sign text, its category (0 = none)]

        number_or_word = 0
        number_plus = 1        # multiplier (+1 / -1) applied to the number

        for bit in line:
            number_or_word = 0
            is_sign = scanner.IsSign(bit) > 0

            # --- Signs -----------------------------------------------------
            if is_sign and sign_list[1] > 0 and scanner.IsSign(sign_list[0]+bit) < 0:
                # The pending sign plus this one is not itself a known sign:
                # emit the pending sign and start over with this character.
                print('(', sign_list[1], ',"', sign_list[0], '")')
                clear_sign(sign_list)

            if is_sign:  # remember the sign
                sign_list[0] = sign_list[0] + bit
                sign_list[1] = scanner.IsSign(bit)
            if is_sign and word != '':
                # word followed by a sign: emit the word
                if scanner.searchReserve(word) == True:
                    print('(', 1, ',"', word, '")')
                else:
                    print('(', 2, ',"', word, '")')
                word = ''
                number_or_word=0
            elif is_sign and number_list[1] != 0:
                # number followed by a sign: emit the number
                print('(', 3, ',"', number_plus*number_list[0], '")')
                clear_number(number_list)
                number_or_word = 0
                number_plus = 1

            # --- Words -----------------------------------------------------
            if scanner.IsLetter(bit) and sign_list[1] > 0:
                # sign followed by a letter: emit the sign, then go on with
                # the word
                print('(', sign_list[1], ',"', sign_list[0], '")')
                clear_sign(sign_list)

            if scanner.IsLetter(bit) and number_list[1]==0:   # letter
                word = word + bit
                number_or_word = 1
                continue
            elif word != '' and scanner.IsDigit(bit):  # digit inside a word
                word = word + str(bit)
                continue
            elif word != '' and bit == ' ':  # word terminated by a space
                if scanner.searchReserve(word) == True:
                    print('(', 1, ',"', word, '")')
                else:
                    print('(', 2, ',"', word, '")')
                word = ''
                continue

            # --- Numbers ---------------------------------------------------
            if scanner.IsDigit(bit)==True and sign_list[1] > 0:
                # sign followed by a digit: either emit the sign, or treat a
                # '+' / '-' as the sign of the number.
                # NOTE(review): number_or_word is reset to 0 at the top of
                # every iteration and is only set to 1 right before a
                # `continue`, so the `number_or_word > 0` branch below never
                # runs; a '+' or '-' directly before a digit is always
                # absorbed into the number (e.g. "a+1" emits no operator).
                if sign_list[0] not in ['+', '-']:
                    print('(', sign_list[1], ',"', sign_list[0], '")')
                    clear_sign(sign_list)
                elif number_or_word > 0:
                    pass
                elif number_or_word == 0:
                    number_plus = int(sign_list[0]+'1 ')
                    clear_sign(sign_list)

            if scanner.IsDigit(bit)==True and word == '':  # digit of a number
                number_list[1] = 1
                number_list[0] = number_list[0] * 10 + int(bit)
                number_or_word = 1
            elif number_list[1] == 1 and scanner.IsLetter(bit):
                # a number directly followed by a letter is an error
                print('Error : (', 6, ',"', str(number_list[0]) + bit, '")')
                clear_number(number_list)
                clear_sign(sign_list)
                word = ''
                # Drop the sign of the discarded number so it cannot leak
                # into the next number on this line.
                number_plus = 1
            elif number_list[1] == 1 and bit == ' ':
                # number terminated by a space: emit it
                print('(', 3, ',"', number_plus*number_list[0], '")')
                number_plus = 1
                clear_number(number_list)

            if bit == ' ' and sign_list[1] != 0:
                # sign terminated by a space: emit it
                print('(', sign_list[1], ',"', sign_list[0], '")')
                clear_sign(sign_list)

        # End of line: emit whatever token is still pending. This runs after
        # the loop rather than inside it because the word branches above
        # `continue` past the end of the loop body, and because nothing
        # should be printed when no token is pending.
        if word != '':
            if scanner.searchReserve(word) == True:
                print('(', 1, ',"', word, '")')
            else:
                print('(', 2, ',"', word, '")')
        if number_list[1] != 0:
            print('(', 3, ',"', number_plus*number_list[0], '")')
        if sign_list[1] != 0:
            print('(', sign_list[1], ',"', sign_list[0], '")')

标签:word,scanner,python,list,number,sign,词法,编译,bit
来源: https://www.cnblogs.com/amtop/p/16436985.html