语法分析器python版本
作者:互联网
可以直接去我的gitee空间下载代码,参考博客理论理解
1、题目:
1.2、理论
最终实现的结果:
2、代码
实现间接左递归、直接左递归、消除回溯以及题目要求信息
文件结构如下图:
Stack是自己实现栈的结构,Grammar是定义文法,Parser是转换,连接Grammar和Main
Main是程序入口
2.1、Stack.py
class Stack: # 栈的实现
def __init__(self):
self.items = []
def push(self, item):
self.items.append(item)
def pop(self):
if self.size() > 0:
return self.items.pop()
else:
return None
def peek(self):
if len(self.items) > 0:
return self.items[len(self.items) - 1]
else:
return None
def size(self):
return len(self.items)
2.2、Grammar.py
from pandas import DataFrame
from copy import deepcopy
class Grammar:
"""
对消除左递归和回溯的产生式进行语法分析,得到对应的LL(1)分析表:DataFrame类型
"""
Vn = [] # 非终结符集
Vt = [] # 终结符集
S = '' # 文法开始符
M = DataFrame() # 分析表
FIRST = {} # First集 {char:[]}
FOLLOW = {} # Follow集 {char:[]}
Production = {} # 产生式 {char:[]}
def __init__(self, filename) -> '输入原始的产生式':
self.Vn = [] # 非终结符集
self.Vt = [] # 终结符集
self.M = DataFrame() # 分析表
self.FIRST = {} # First集
self.FOLLOW = {} # Follow集
self.S = '' # 文法开始符
# 手工将测试的表达式写入文本文件,每个表达式写一行,用";"表示结束
# 读入文本文件中的表达式;
is_SetHead = True
with open(filename, 'r', encoding="utf-8") as f:
for lineStr in f:
# 清除无效字符
lineStr = lineStr.replace(' ', '')
lineStr = lineStr.replace('\t', '')
lineStr = lineStr.replace('\r', '')
lineStr = lineStr.replace('\n', '')
if is_SetHead and len(lineStr) > 0:
is_SetHead = False
self.S = lineStr[:lineStr.index('->')] # 设置文法开始符
if lineStr != '':
self.__saveInfoFromFile(lineStr) # 保存未处理的产生式Production、非终结符集Vn、终结符集Vt
self.__modifyProduction()
self.__saveProduction2File()
def __saveInfoFromFile(self, line) -> '预处理后的line保存产生式Production,非终结符集Vn,终结符集Vt':
if line.find("->") == -1:
raise Exception('语法错误')
# 获取产生式左侧
key = line[:line.index('->')]
if key == '':
raise Exception('语法错误')
self.Vn.append(key) # 保存终结符集Vt
tmpString = line[line.index('>') + 1:]
if tmpString == '':
raise Exception('语法错误')
leavesList = tmpString.strip(';').split('|')
self.Production[key] = deepcopy(leavesList) # 保存产生式Production
for i in range(len(tmpString)): # 遍历产生式右侧
ch = tmpString[i]
if not ch.isupper() and ch not in ['|', ';', 'ε', '\'']:
self.Vt.append(ch) # 保存终结符集Vt
# 1、消除左递归和消除回溯(即提取公因字符串再去除),主要实现在最后
def __modifyProduction(self) -> '预处理产生式':
# 理论知识参考<a href=https://blog.csdn.net/u010204038/article/details/41942035?utm_source=blogxgwz2></a>
# 1、把间接左递归文法改写为直接左递归文法
self.__removeIndirectLeftRecursion()
# 2、然后用消除直接左递归的方法改写文法。
self.__removeDirectLeftRecursion()
# 3、消除回溯
self.__removeSamePrefix()
def getVn(self):
return self.Vn
def getVt(self):
return self.Vt
def getS(self):
return self.S
def getFirst(self):
return self.FIRST
def getFollow(self):
return self.FOLLOW
def getTableM(self):
return self.M
def showProduction(self) -> '展示产生式':
for key, value in self.Production.items():
print(key, end='->')
for i in range(len(value)):
if i < len(value) - 1:
print(value[i], end="|")
else:
print(value[i])
def showFirst(self) -> '获取产生式中的First集合以字典类型':
for key, value in self.FIRST.items():
print('FIRST(%s' % key, end='):{')
tmpList = list(value)
for i in range(len(tmpList)):
if i < len(tmpList) - 1:
print(tmpList[i], end=",")
else:
print(tmpList[i], end='}\n')
def showFollow(self) -> '获取产生式中的Follow集合以字典类型':
for key, value in self.FOLLOW.items():
print('FOLLOW(%s' % key, end='):{')
tmpList = list(value)
for i in range(len(tmpList)):
if i < len(tmpList) - 1:
print(tmpList[i], end=",")
else:
print(tmpList[i], end='}\n')
def showTableM(self) -> '显示分析表':
if len(self.M) == 0:
self.setTableM()
print(self.M)
# 2、获取每个非终结符的First集合
def setFirst(self):
change = 1 # 如果first集合发生变化就继续,直到first集合不再变化
# 非终结符的First集合
while change:
change = 0
for i in self.Vn: # 遍历所有的非终结符
for j in self.Production[i]: # 遍历所有的非终结符
empty_flag = 0 # 标志是否有空元素ε
for indexCh in range(len(j)): # 对产生式的某一个叶子进行遍历
# 处理类似F'的'字符
if j[indexCh] == '\'':
continue
elif indexCh + 1 <= len(j) - 1 and j[indexCh + 1] == '\'':
nowIndex = indexCh
ch = j[indexCh] + j[indexCh + 1]
nowIndex += 1
while nowIndex + 1 <= len(j) - 1 and j[nowIndex + 1] == '\'':
ch = j[indexCh] + j[indexCh + 1]
nowIndex += 1
else:
ch = j[indexCh]
# 产生First的步骤1
if ch in self.Vt: # 是否是终结符,是则直接加入First集合
if i not in self.FIRST.keys():
self.FIRST[i] = []
if ch not in self.FIRST[i]:
self.FIRST[i].append(ch) # 是则直接加入First集合
change = 1
empty_flag = 0 # 没有空串ε
elif ch == 'ε': # 产生First的步骤2
empty_flag = 1
elif ch in self.FIRST.keys(): # 产生First的步骤3,非终结符的First
for k in self.FIRST[ch]: # ch已经建立好First集
if k == 'ε':
empty_flag = 1
else:
if i not in self.FIRST.keys():
self.FIRST[i] = []
if k not in self.FIRST[i]:
self.FIRST[i].append(k)
change = 1
empty_flag = 0
if empty_flag == 0: # 不含有空元素,不对同一个叶子继续遍历
break
if empty_flag == 1: # 每个非终结符的First的集合都有空串
if i not in self.FIRST.keys():
self.FIRST[i] = []
if 'ε' not in self.FIRST[i]:
self.FIRST[i].append('ε') # 是ε则直接加入First集合
change = 1
# 终结符的First集合是它本身
for i in self.Vt:
self.FIRST[i] = [i]
def getFirstByChar(self, ch) -> '获取单个字符的First集合':
if ch in self.FIRST:
return self.FIRST[ch]
else:
return []
# 3、获取每个非终结符的Follow集合
def setFollow(self):
# 需要先求出First集合
if len(self.FIRST) == 0:
self.setFirst()
# 语法规则1:
self.FOLLOW[self.S] = ['#']
change = 1
while change:
change = 0
for i in self.Vn: # 遍历所有的非终结符
for j in self.Production[i]: # 遍历非终结符对应的产生式的右侧的叶子们
for indexCh in range(len(j)): # 对产生式的某一个叶子进行遍历
nowIndex = indexCh # 代表当前字符下标
# 处理类似F'的'字符
if j[indexCh] == '\'':
continue
elif indexCh + 1 <= len(j) - 1 and j[indexCh + 1] == '\'':
ch = j[indexCh] + j[indexCh + 1]
nowIndex += 1
while nowIndex + 1 <= len(j) - 1 and j[nowIndex + 1] == '\'':
ch += j[nowIndex + 1]
nowIndex += 1
else:
ch = j[indexCh]
if ch in self.Vn: # 非终结符
# 语法规则第2条
if nowIndex + 1 <= len(j) - 1: # 形同A->aBb,取得子串b
s = j[nowIndex + 1] # 需要考虑E''的情况
tmpIndex = nowIndex + 1 # 代表当前字符的后一个字符下标
while tmpIndex + 1 <= len(j) - 1 and j[tmpIndex + 1] == '\'':
s += j[tmpIndex + 1]
tmpIndex += 1
se = self.getFirstByChar(s)
# A->aBb,把子串b的First集的所有非空元素加入B的Follow集合中
for k in se: # 遍历se,所有非空元素加入
if k != 'ε':
if ch not in self.FOLLOW.keys():
self.FOLLOW[ch] = []
if k not in self.FOLLOW[ch]:
self.FOLLOW[ch].append(k)
change = 1
if 'ε' in se: # 语法规则第3条第2项,把i的Follow集加入B的Follow集合中
if i in self.FOLLOW.keys():
for k in self.FOLLOW[i]: # 遍历i的Follow集
if ch not in self.FOLLOW.keys():
self.FOLLOW[ch] = []
if k not in self.FOLLOW[ch]:
self.FOLLOW[ch].append(k)
change = 1
else:
self.FOLLOW[i] = []
else: # 形同A->aB,语法规则第3条第1项,把i的Follow集加入B的Follow集合中
if i in self.FOLLOW.keys():
for k in self.FOLLOW[i]:
if ch not in self.FOLLOW.keys():
self.FOLLOW[ch] = []
if k not in self.FOLLOW[ch]:
self.FOLLOW[ch].append(k)
change = 1
def getFollowByChar(self, ch) -> '获取单个字符的Follow集合':
if ch in self.FIRST:
return self.FIRST[ch]
else:
return []
# 4、建立分析表
def setTableM(self) -> '需要经过预处理--消除左递归和回溯,再调用这个函数返回分析表':
if len(self.FOLLOW) == 0:
self.setFollow()
Columns = self.Vt + ['#']
Index = self.Vn
self.M = DataFrame(index=Index, columns=Columns)
for i in range(len(Index)): # 遍历非终结符
A = Index[i]
for j in self.Production[A]: # 遍历非终结符对应的产生式的右侧的叶子们
leaveString = A + "->" + j
nowIndex = 0
ch = j[0]
# 语法规则1:需要对非终结符的First集处理
if ch.isupper(): # 为非终结符,都是大写字母
while nowIndex + 1 <= len(j) - 1 and j[nowIndex + 1] == '\'':
ch += j[nowIndex + 1]
nowIndex += 1
for k in self.FIRST[ch]: # 遍历非终结符的First集
if k == 'ε':
continue
y = Columns.index(k)
x = i
self.M.values[x, y] = leaveString
elif ch in self.FIRST[A]: # 语法规则2:对自己First集的每个终结符
if ch != 'ε':
y = Columns.index(ch)
x = i
self.M.values[x, y] = leaveString
else: # 语法规则3:对任何属于Follow(A)的处理,注意:Follow集不可能有空串
for k in self.FOLLOW[A]: # 遍历
y = Columns.index(k)
x = i
self.M.values[x, y] = leaveString
self.M.fillna('ERROR', inplace=True)
# 1.1、消除间接左递归(要求文法不存在A 经过一次或多次能推导出A和不存在ε产生式(形如A→ε))
"""
前提:(1)文法中不含有回路(2)文法中不含有空串为右部的产生式
步骤:
(1)把非终结符整理成某个顺序
A1→δ1 /δ2 /…/δk
A2→A1r...
A3→A2u|A1v...
若多个非终结符都有间接左递归,其中排序排在最后的非终结符用作消除左递归
(2)如果一个文法不含有回路,即形如PP的推导,也不含有以ε为右部的产生式,那么就可以采用下述算法消除文法的所有左递归。
for(i=1;i<=n;i++)
for(j=1;j<=i-1;j++)
{ 把形如Ai→Ajγ的产生式改写成Ai→δ1γ /δ2γ /…/δkγ
其中Aj→δ1 /δ2 /…/δk是关于的Aj全部规则;
再消除Ai规则中的直接左递归;
}
(3)化简得到的文法:去掉从开始符号S出发,在推导中无法出现的非终结符的产生式(去掉多余产生式)
"""
def __removeIndirectLeftRecursion(self):
"""
测试案例:
S->Qc|c;
Q->Rb|b;
R->Sa|a;
"""
# 排序
Vn = self.Vn # 对Vn处理排序后的Vn副本
i = len(Vn) - 2
while i >= 0:
ch0 = self.Vn[i]
# 向后查看
for j in range(i + 1, len(Vn)):
ch1 = Vn[j]
rightList = self.Production[ch1]
for k in rightList:
if k[0] == ch0:
ch0Index = Vn.index(ch0)
ch1Index = Vn.index(ch1)
Vn[ch1Index], Vn[ch0Index] = Vn[ch0Index], Vn[ch1Index]
break
i -= 1
# 消除间接左递归
for i in range(1, len(Vn)): # 处理Ai->Ajγ等转换
for j in range(i):
isSame = False # 在i的产生式右侧的候选项中出现Vn[j]
ch = Vn[j]
Right1 = self.Production[Vn[i]] # Ai产生式的右侧
Right2 = self.Production[Vn[j]] # Aj产生式的右侧
tmpList = [] # 存放缓冲
for k1 in Right1: # 遍历Ai产生式右侧子叶
if k1[0] == ch: # Ai->Ajγ
isSame = True
for k2 in Right2: # 遍历Aj产生式右侧子叶
if k2 != 'ε':
tmpList.append(k2 + k1[1:])
if isSame: # 前面产生式右侧出现本产生式的左侧字母
Right1Size = len(Right1)
for _ in range(Right1Size): # 遍历Ai产生式右侧子叶
if Right1[0][0] == ch:
del self.Production[Vn[i]][0]
else:
tmpList.append(Right1[0])
del self.Production[Vn[i]][0]
self.Production[Vn[i]].extend(set(tmpList))
saveSet = {self.S}
saveSet = self._get_simple_Production(self.S, saveSet)
for item in self.Vn:
if item not in saveSet:
del self.Production[item]
self.Vn = [i for i in self.Production.keys()] # 可能增加Vn,这里需要更新处理
def _get_simple_Production(self, ch, result):
# 传入的是产生式左侧,在消除间接左递归的最后检查是否有可以删去的多余产生式
tmpResult = set()
tmpDict = self.Production[ch]
tmpSet = set(self.Vn) - {ch}
for item in tmpSet:
for rightItem in tmpDict:
if item in rightItem and item not in result:
tmpResult.update(item)
break
for item in tmpResult:
result.add(item)
result.update(self._get_simple_Production(item, result))
return result
# 1.2、消除直接左递归
"""
1、把所有产生式写成候选式形式。如A→Aa1|Aa2……|Aan|b1|b2……|bm。其中每个a都不等于ε,而第个b都不以A开头。
2、变换候选式成如下形式:
A→b1A’|b2A’……|bmA’
A’ →a1A’|a2A’……|anA’|ε
"""
def __removeDirectLeftRecursion(self):
for ch in list(self.Production.keys()):
newCh = ch + '\''
sameFlag = False
while newCh in self.Production.keys():
newCh += '\''
for k in self.Production[ch]:
if k[0] == ch:
sameFlag = True
self.Production[newCh] = []
break
if not sameFlag: # 没有出现和产生式左侧相同的,不需要继续
continue
tmpList = [] # 存放待替换原来的右侧的列表
# 删除原来右侧,增加新产生式右侧
oldList = self.Production[ch]
while len(oldList) > 0:
if oldList[0][0] == ch:
tmpStr = oldList[0][1:]
self.Production[newCh].append(tmpStr + newCh)
else:
tmpStr = oldList[0]
tmpList.append(tmpStr + newCh)
del oldList[0]
self.Production[newCh].append('ε') # 加入空串
# 重进原来的右侧
self.Production[ch].extend(tmpList)
self.Production = dict(sorted(self.Production.items(), key=lambda x: x[0]))
self.Vn = [i for i in self.Production.keys()] # 可能增加Vn,这里需要更新处理
# 1.3、消除回溯
def __removeSamePrefix(self):
samplesCopy = deepcopy(self.Production)
samples = {}
change = True
while change:
change = False
for oldLeftKey, rightList in samplesCopy.items():
# 判断是不是需要合并
if len(rightList) == 0:
continue
dictTmp = {}
i = 0
while i < len(rightList):
if rightList[i][0] in dictTmp.keys():
i += 1
continue
oldStartChar = rightList[i][0]
dictTmp[oldStartChar] = [rightList[i]]
j = i + 1
newStartChar = oldLeftKey + '\''
while newStartChar in dictTmp.keys() or newStartChar in samples.keys():
newStartChar += '\''
while j < len(rightList): # 遍历右侧列表
if rightList[j][0] == oldStartChar:
change = True
if newStartChar not in dictTmp.keys(): # 遇到后面首字母重复的考虑是否已经在字典中建立key
dictTmp[newStartChar] = []
if oldStartChar + newStartChar not in dictTmp[oldStartChar]: # 判断是否修改原理对应的字典key的valuelist
dictTmp[newStartChar].append(dictTmp[oldStartChar][0][1:])
dictTmp[oldStartChar] = [oldStartChar + newStartChar]
if rightList[j][1:] != '':
dictTmp[newStartChar].append(rightList[j][1:]) # 在新的字典key对应的valuelist中加入新字符串
else:
dictTmp[newStartChar].append('ε')
j += 1
i = i + 1
# 将本次得到的dictTmp更新到全局变量字典类型的samples
samples[oldLeftKey] = []
for key, valueList in dictTmp.items():
if len(valueList) == 1:
samples[oldLeftKey].extend(valueList)
else: # 可能还需要继续
samples.update({key: valueList})
samplesCopy = deepcopy(samples) # 更新当前全局变量到副本,再进入一次循环,看是否可以退出循环
self.Production = dict(sorted(samples.items(), key=lambda x: x[0]))
self.Vn = [i for i in self.Production.keys()] # 可能增加Vn,这里需要更新处理
# 1.4、保存产生式
def __saveProduction2File(self):
fileName = r'ModifiedRule.txt'
if len(self.Production) == 0:
return
with open(fileName, 'w') as f:
for key, valueList in self.Production.items():
f.write(key + '->' + valueList[0])
for i in range(1, len(valueList)):
f.write("|" + valueList[i])
f.write(';\n')
2.3、Parser.py
# 语法分析的主类
from pandas import DataFrame
from copy import deepcopy
from Stack import Stack
class Parser:
# M: 分析表
# inputString: 输入串
# vt: 终结符集合
# vn: 非终结符集合
__M, __inputString, __vt, __vn = DataFrame(), "", [], []
__stack = Stack() # 分析栈
def __init__(self, M, inputString, vt, vn):
"""
初始化语法分析器
:param M: 分析表:DataFrame类型
:param inputString: 输入串,以#结尾:str类型
:param vt: 终结符集合:list
:param vn: 非终结符集合:list,第一个是文法开始符S
----
Example:parser = Parser(DataFrame(), " ", [], ['S'])
"""
if len(inputString) == 0:
raise Exception("输入字符串不能为空")
if len(vn) == 0:
raise Exception("Vn是非空有限集的非终结符集,第一个元素是文法开始符,Vn不能为空")
self.__M, self.__inputString, self.__vt, self.__vn = deepcopy(M), deepcopy(inputString), deepcopy(vt), deepcopy(
vn)
self.__stack = Stack()
self.__stack.push('#')
self.__stack.push(self.__vn[0])
def setM(self, M):
self.__M = deepcopy(M)
def setInputString(self, inputString):
self.__inputString = inputString
def isVt(self, c):
"""
判断是否为终结符
:param c: 输入字符
:return: boolean结果
"""
return c in self.__vt
def isVn(self, c):
"""
判断是否为非终结符
:param c: 输入字符
:return: boolean结果
"""
return c in self.__vn
def __showStack(self, resultStack):
tmp = ''
for ch in self.__stack.items:
# print(ch, end="")
tmp += ch
# print("\t\t\t", end="")
resultStack.push(tmp)
"""
输出其分析过程,并在最后显示判断输入串是否为该文法的句子,并返回记录列表
"""
def parse(self):
if len(self.__inputString) == 0:
raise Exception("输入字符串不能为空")
resultStack = Stack() # 保留记录,可以显示在前端
# 输入的第一个字符a
a = self.__inputString[0]
# 栈顶元素
x = self.__stack.peek()
# 定义元素对应的下标
indexVt = -1 # 终结符对应下标
indexVn = -1 # 非终结符对应的下标
indexInput = 0 # 字符串当前指针对应的下标
# LL(1)的说明
# express = 'None'
# print('\033[31m符号栈', ' ' * 12, '当前输入符号', ' ' * 7, '输入串', ' ' * 7, '所用表达式\033[0m')
resultStack.push('符号栈')
resultStack.push('当前输入符号')
resultStack.push('输入串')
resultStack.push('所用表达式')
cycleMark = True # 循环是否继续
while cycleMark:
# 输出栈元素,传入resultStack,是为了保存值到resultStack,而不是遍历resultStack
self.__showStack(resultStack)
# 输出当前输入字符
# print('\t', a, "\t\t\t\t\t", end="")
resultStack.push(a)
# 输入串
# print(self.__inputString[indexInput + 1:], '\t\t\t\t', end="")
resultStack.push(self.__inputString[indexInput + 1:])
express = 'None'
x = self.__stack.pop() # 栈顶弹出
if self.isVt(x): # 判断是否为终结字符
if x == a:
if a == '#':
# 输出说明
express = '分析成功'
# print(express)
resultStack.push(express)
break # 成功,可以停止分析
# 弹出栈顶,字符串指针后移,对下个字符分析
indexInput += 1
a = self.__inputString[indexInput]
elif self.isVn(x): # 是非终结符
indexVn = self.__vn.index(x)
indexVt = self.__vt.index(a)
express = self.__M.values[indexVn][indexVt]
if '>' not in express:
raise Exception('语法错误')
# 产生式右侧分割成数组
tmpString = express[express.index('>') + 1:]
charArray = []
for i in range(len(tmpString)):
if tmpString[i] == '\'':
charArray[i - 1] += tmpString[i]
else:
charArray.append(tmpString[i])
# 判断首个字符是否为空串,是则跳过
if charArray[0] == 'ε':
# print(express)
resultStack.push(express)
continue
# 逆序把产生式右侧加入栈
for i in range(len(charArray) - 1, -1, -1):
self.__stack.push(charArray[i])
else:
raise Exception('未定义字符')
# 输出说明
# print(express)
resultStack.push(express)
if x == '#':
cycleMark = False
return resultStack
def __is_Chinese(self, word):
for ch in word:
if '\u4e00' <= ch <= '\u9fff':
return True
return False
def showResult(self):
try:
stack = self.parse()
for i in range(stack.size()):
if i % 4 == 0:
print()
if not self.__is_Chinese(stack.items[i]):
print("%10s" % stack.items[i], end=" " * 4)
else:
print('{0:{1}>6}'.format(stack.items[i], chr(12288)), end=' ' * 4)
return stack
except Exception as e:
print('\033[31m' + str(e) + '\033[0m')
return Stack()
2.4、Main.py
from Parser import Parser
from Grammar import Grammar
def main():
g = Grammar(r'PureRule.txt')
g.setTableM()
testM = g.getTableM()
Headers = testM.columns.values.tolist()
Index = testM.index.values.tolist()
# 终结符集
testVt = Headers
# 非终结符集
testVn = Index
# 输入字符串
testInputString = 'i+i#'
parser = Parser(testM, testInputString, testVt, testVn)
# 获得
try:
parser.showResult()
except Exception as e:
print('\033[31m'+str(e)+'\033[0m')
if __name__ == '__main__':
main()
"""
测试案例
# from pandas import DataFrame
# Index = ['E', 'E\'', 'T', 'T\'', 'F']
# Headers = ['i', '+', '*', '(', ')', '#']
# Data = [['E->TE\'', 'ERROR', 'ERROR', 'E->TE\'', 'ERROR', 'ERROR'],
# ['ERROR', 'E\'->+TE\'', 'ERROR', 'ERROR', 'E\'->ε', 'E\'->ε'],
# ['T->FT\'', 'ERROR', 'ERROR', 'T->FT\'', 'ERROR', 'ERROR'],
# ['ERROR', 'T\'->ε', 'T\'->*FT\'', 'ERROR', 'T\'->ε', 'T\'->ε'],
# ['F->i', 'ERROR', 'ERROR', 'F->(E)', 'ERROR', 'ERROR']]
# # 分析表,包含产生式的关系
# testM = DataFrame(Data, index=Index, columns=Headers)
"""
2.5、PureRule.txt
E->E+T|T;
T->T*F|F;
F->(E)|i;
对应生成的规则:
标签:ch,语法分析,python,self,Vn,Production,版本,终结符,def 来源: https://blog.csdn.net/weixin_44001521/article/details/112002614