筛选dataType为特定值的数据,并对roomLiveTitle进行去重
作者:互联网
# Filter the entities of a pretty-printed JSON-like text file by the value of
# one field (e.g. dataType) and then de-duplicate the kept entities by another
# field (e.g. roomLiveTitle).
#
# The file is processed line by line (it is NOT parsed as JSON). Every entity
# is expected to be laid out like this, one key per line:
#
# {
#     "data": {
#         "tagList": [
#             "测试"
#         ],
#         "roomLiveTitle": "直播间标题-大会直播",
#         "roomLiveId": 4003879,
#         "status": 4
#     },
#     "dataIndex": 1,
#     "dataType": 4,
#     "dataSource": 3
# },
class ShaiXuanLeiXingAndBiaoTi:
    """Line-based filter and de-duplicator for a list of "entity" objects.

    Attributes:
        file_name: path of the input file.
        read_list: all lines of the input file (with line endings).
        read_list_len: number of lines in ``read_list``.
        mubiao_list: lines that are kept (entities whose field value matches).
        sheqi_list: lines that are discarded (entities whose value differs).
        end_num: index in ``read_list`` of the next line to process.
    """

    # Line (after stripping whitespace) that opens the nested "data" object.
    # The line directly above it is taken to be the opening "{" of the entity.
    DATA_MARKER = '"data": {'

    def __init__(self, file_name):
        self.file_name = file_name
        self.mubiao_list = []
        self.sheqi_list = []
        self.read_list = self.readText(self.file_name)
        self.read_list_len = len(self.read_list)
        self.end_num = 0

    def readText(self, file_name):
        """Return every line of the UTF-8 text file ``file_name`` as a list."""
        with open(file=file_name, mode='r', encoding="utf-8") as f:
            return f.readlines()

    def getDataType(self, datatype_hang):
        """Extract the value from a ``"key": value,`` line.

        The value is the text between the first ``:`` and the next ``:`` or
        ``,``, stripped of surrounding whitespace, so ``'"dataType": 4,'``
        yields ``"4"``.

        Raises:
            IndexError: if the line contains no ``:``.
        """
        datatype_list = datatype_hang.split(":")
        print("datatype_list:")
        print(datatype_list)
        ziduan_zhi_list = datatype_list[1].split(",")
        print("ziduan_zhi_list:")
        print(ziduan_zhi_list)
        ziduan_zhi = ziduan_zhi_list[0].strip()
        print("ziduan_zhi:")
        print(ziduan_zhi)
        print(type(ziduan_zhi))
        return ziduan_zhi

    def getDeleteDataList(self):
        """Pop the head of the current (rejected) entity off ``mubiao_list``.

        Lines are removed from the end of ``mubiao_list`` up to and including
        the ``"data": {`` line, plus the one line above it (the entity's
        opening brace).

        Returns:
            The removed lines, in reverse file order (last removed line last).

        Raises:
            IndexError: if ``mubiao_list`` runs out before the marker line and
                the line above it have been found.
        """
        print("处理前self.mubiao_list个数")
        print(len(self.mubiao_list))
        print(self.mubiao_list)
        zhongjian_list = []
        num = 0
        while True:
            num += 1
            print("删除目标,保存数据到中间列表中,循环处理第%s次" % str(num))
            zuihouyixiang = self.mubiao_list.pop()
            print("zuihouyixiang:")
            print(zuihouyixiang)
            zhongjian_list.append(zuihouyixiang)
            if zuihouyixiang.strip() == self.DATA_MARKER:
                print("zuihouyixiangdai{")
                print(zuihouyixiang)
                break
        # One more line: the "{" that opens the entity itself.
        zhongjian_list.append(self.mubiao_list.pop())
        print("处理后self.mubiao_list个数")
        print(len(self.mubiao_list))
        print("zhongjian_list个数")
        print(len(zhongjian_list))
        return zhongjian_list

    def getSheQiList(self, zhongjian_list):
        """Move ``zhongjian_list`` onto ``sheqi_list`` in reversed order.

        ``zhongjian_list`` holds lines in reverse file order, so reversing it
        restores file order. The input list is emptied.
        """
        print("zhongjian_list:")
        print(zhongjian_list)
        self.sheqi_list.extend(zhongjian_list[::-1])
        del zhongjian_list[:]
        print("self.sheqi_list:")
        print(self.sheqi_list)

    def xiangxia(self, max_lines=10):
        """Discard the tail of a rejected entity, starting at ``end_num``.

        Up to ``max_lines`` lines are copied from ``read_list`` to
        ``sheqi_list``, stopping after the first line that contains ``}``.

        Args:
            max_lines: maximum number of lines to look ahead (default 10).

        Returns:
            Number of look-ahead steps performed; the caller adds it to
            ``end_num``.

        Raises:
            ValueError: if ``max_lines`` is smaller than 1 (the caller would
                never advance).
        """
        if max_lines < 1:
            raise ValueError("max_lines must be >= 1")
        k = 0
        for j in range(0, max_lines):
            k = k + 1
            xiabiao = self.end_num + j
            if xiabiao >= self.read_list_len:
                # Past the end of the input: nothing to copy. The step is
                # still counted, matching the historical return value.
                continue
            print("处理到self.read_list中第%s下表的的内容" % str(xiabiao))
            one_hang = self.read_list[xiabiao]
            self.sheqi_list.append(one_hang)
            if "}" in one_hang.strip():
                print("one_hang},")
                print(one_hang)
                break
        print("k:")
        print(k)
        return k

    def writeListToTxt(self, file_name, list_data):
        """Write every string of ``list_data`` to ``file_name`` (UTF-8)."""
        with open(file_name, "w", encoding="utf-8") as f:
            f.writelines(list_data)

    # NOTE: the parameter name ``str`` shadows the builtin; it is kept because
    # callers pass it by keyword.
    def writeStrToTxt(self, file_name, str):
        """Write the string ``str`` to ``file_name`` (UTF-8)."""
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(str)

    def _scanKeepLines(self, ziduan, ziduanzhi, first_pass):
        """Copy lines from ``end_num`` onward into ``mubiao_list``.

        Stops (without consuming the line) at the first line that contains
        ``ziduan`` (case-insensitive) whose value differs from ``ziduanzhi``.
        ``end_num`` is advanced by one for every line kept.
        """
        ziduan_lower = ziduan.lower()
        if first_pass:
            tishi = "遇到第一个%s值不是%s的%s行的内容"
        else:
            tishi = "遇到一个%s值不是%s的%s行的内容"
        for i in range(self.end_num, self.read_list_len):
            print("处理到第%s行的内容" % str(i))
            one_hang = self.read_list[i]
            if ziduan_lower in one_hang.lower():
                print(tishi % (ziduan, ziduanzhi, i))
                print(one_hang)
                data_type_zhi = self.getDataType(datatype_hang=one_hang)
                print(data_type_zhi)
                if data_type_zhi != ziduanzhi:
                    break
            self.mubiao_list.append(one_hang)
            self.end_num = self.end_num + 1

    def handleOneShiTi(self, ziduan="dataType", ziduanzhi="4"):
        """Split ``read_list`` into kept lines and discarded lines.

        Entities whose ``ziduan`` value equals ``ziduanzhi`` end up in
        ``mubiao_list``; all other entities end up in ``sheqi_list``. Lines
        outside any rejected entity are kept.

        The state (``mubiao_list``, ``sheqi_list``, ``end_num``) is reset on
        entry, so the method can be called repeatedly on one instance.

        Args:
            ziduan: name of the field to filter on (matched case-insensitively
                as a substring of the line).
            ziduanzhi: expected value; compared as text.
        """
        self.read_list_len = len(self.read_list)
        self.mubiao_list = []
        self.sheqi_list = []
        self.end_num = 0
        ziduanzhi = str(ziduanzhi)  # getDataType always returns text

        first_pass = True
        while first_pass or self.end_num < self.read_list_len:
            self._scanKeepLines(ziduan, ziduanzhi, first_pass)
            first_pass = False
            print("处理到self.mubiao_list第%s个下标" % str(self.end_num))
            print("从断点下标%s开始处理" % str(self.end_num))
            # The scan reached the end of the input: there is no rejected
            # entity to remove, so nothing must be popped from mubiao_list.
            if self.end_num >= self.read_list_len:
                print("从断点下标%s超过超过数列%s长度,终止循环"
                      % (str(self.end_num), str(self.read_list_len)))
                break
            # end_num now points at the mismatching field line: move the head
            # of that entity out of mubiao_list, then consume its tail.
            zhongjian_list = self.getDeleteDataList()
            print("self.mubiao_list_hou:")
            print(self.mubiao_list)
            print(len(self.mubiao_list))
            self.getSheQiList(zhongjian_list)
            k = self.xiangxia()
            self.end_num = self.end_num + k
            print("接着从断点 %s行开始处理" % str(self.end_num))

    def handleFile(self, ziduan="dataType", ziduanzhi="4"):
        """Filter the input and write the kept / discarded lines to files.

        Writes ``mubiao_<ziduan>_<ziduanzhi>.txt`` (kept lines) and
        ``sheqi_not_<ziduan>_<ziduanzhi>.txt`` (discarded lines) into the
        current working directory.

        Returns:
            The name of the file holding the kept lines.
        """
        print("self.read_list:")
        print(self.read_list)
        self.read_list_len = len(self.read_list)
        print("self.read_list_len:")
        print(self.read_list_len)
        self.handleOneShiTi(ziduan=ziduan, ziduanzhi=ziduanzhi)
        print(self.read_list_len)
        print(len(self.mubiao_list))
        print(len(self.sheqi_list))
        mubiao_file = "mubiao_%s_%s.txt" % (ziduan, ziduanzhi)
        sheqi_file = "sheqi_not_%s_%s.txt" % (ziduan, ziduanzhi)
        self.writeListToTxt(file_name=mubiao_file, list_data=self.mubiao_list)
        self.writeListToTxt(file_name=sheqi_file, list_data=self.sheqi_list)
        return mubiao_file

    def firstListDaoXuTOSecondList(self, first_list, second_list):
        """Append ``first_list`` reversed to ``second_list``; empty ``first_list``.

        Returns:
            ``second_list`` (the same object that was passed in).
        """
        second_list.extend(first_list[::-1])
        del first_list[:]
        return second_list

    def firstListZhengXuTOSecondList(self, first_list, second_list):
        """Append ``first_list`` in order to ``second_list`` and return it."""
        second_list.extend(first_list)
        return second_list

    @staticmethod
    def _removeTxtSuffix(file_name):
        """Return ``file_name`` without a trailing ``.txt``.

        ``str.strip(".txt")`` must not be used for this: it strips any of the
        characters ``.``, ``t`` and ``x`` from both ends of the name.
        """
        name = str(file_name)
        if name.endswith(".txt"):
            return name[:-len(".txt")]
        return name

    def _findEntityBounds(self, xiabiao, quanji_list, ziduan="dataType"):
        """Locate the entity that contains line ``xiabiao`` of ``quanji_list``.

        The entity starts one line above the nearest ``"data": {`` line at or
        above ``xiabiao`` and ends at the first line containing ``}`` that is
        at or below the first ``ziduan`` line found below ``xiabiao``.

        Returns:
            ``(start, end)`` — inclusive line indices.

        Raises:
            IndexError: if either boundary cannot be found.
        """
        marker = self.DATA_MARKER.lower()
        start = xiabiao
        while marker not in str(quanji_list[start]).lower():
            if start == 0:
                raise IndexError(
                    'no "data": { line found at or above index %d' % xiabiao)
            start -= 1
        if start > 0:
            # Include the "{" line that opens the entity; never wrap around
            # to the end of the list.
            start -= 1

        ziduan_lower = ziduan.lower()
        end = xiabiao
        found_ziduan = False
        while True:
            end += 1
            one_hang = str(quanji_list[end]).lower()
            if ziduan_lower in one_hang:
                print("已经找到第一个节点%s,再往下找到}就终止" % ziduan)
                found_ziduan = True
            if found_ziduan and "}" in one_hang:
                break
        return start, end

    def getOneShiTiByOneZiduan(self, one_list, quanji_list, ziduan="dataType"):
        """Return all lines of the entity that contains a given line.

        Args:
            one_list: ``[index, line]`` pair; only ``index`` (position of the
                line in ``quanji_list``) is used.
            quanji_list: all lines of the file being searched.
            ziduan: field that follows the nested ``data`` object and precedes
                the entity's closing brace (default ``"dataType"``).

        Returns:
            A new list with the entity's lines in file order.

        Raises:
            IndexError: if the entity boundaries cannot be found.
        """
        xiabiao = int(one_list[0])
        start, end = self._findEntityBounds(xiabiao, quanji_list, ziduan)
        # Front half (up to the given line) and back half, in file order.
        print(list(quanji_list[start:xiabiao + 1]))
        print(list(quanji_list[xiabiao + 1:end + 1]))
        one_duixiang_list = list(quanji_list[start:end + 1])
        print("one_duixiang_list:")
        print(one_duixiang_list)
        return one_duixiang_list

    def getRoomLiveTitle(self, ziduan="dataType", ziduanzhi="4", quchongziduan="roomLiveTitle"):
        """Filter by ``ziduan`` == ``ziduanzhi``, then de-duplicate by a field.

        Runs ``handleFile`` and re-reads the file of kept lines. A line that
        contains ``quchongziduan`` and is textually identical to an earlier
        such line marks its entity as a duplicate. If duplicates exist, two
        more files are written next to the kept-lines file:

        * ``<name>_chongfu.txt`` — the duplicate entities;
        * ``<name>_quchong.txt`` — the kept lines without the duplicates.

        Duplicates are removed by line position, so the first occurrence is
        always preserved even if a duplicate is byte-identical to it.
        """
        mubiao_file = self.handleFile(ziduan=ziduan, ziduanzhi=ziduanzhi)
        with open(file=mubiao_file, mode='r', encoding="utf-8") as f:
            mubiaofile_list = f.readlines()

        # Collect [index, line] for every later occurrence of an already
        # seen de-duplication line.
        quchong_lower = quchongziduan.lower()
        yijian_hang = set()
        roomLiveTitle_chongfu_with_xiaobiao_list = []
        for i, one_hang in enumerate(mubiaofile_list):
            if quchong_lower not in str(one_hang).lower():
                continue
            if one_hang in yijian_hang:
                roomLiveTitle_chongfu_with_xiaobiao_list.append([i, one_hang])
            else:
                yijian_hang.add(one_hang)

        print("roomLiveTitle_chongfu_with_xiaobiao_list:")
        print(roomLiveTitle_chongfu_with_xiaobiao_list)
        if not roomLiveTitle_chongfu_with_xiaobiao_list:
            print("无重复【%s】" % quchongziduan)
            return

        print("获取重复的内容:")
        all_chongfu_shiti_list = []  # lines of all duplicate entities
        shanchu_xiabiao = set()      # indices of the lines to drop
        for one_list in roomLiveTitle_chongfu_with_xiaobiao_list:
            print("循环获取内容")
            start, end = self._findEntityBounds(
                int(one_list[0]), mubiaofile_list, ziduan)
            one_shiti_list = mubiaofile_list[start:end + 1]
            print("one_duixiang_list:")
            print(one_shiti_list)
            all_chongfu_shiti_list.extend(one_shiti_list)
            shanchu_xiabiao.update(range(start, end + 1))
        print("all_chongfu_shiti_list:")
        print(all_chongfu_shiti_list)

        jichu_name = self._removeTxtSuffix(mubiao_file)
        chongfu_file_name = jichu_name + "_chongfu" + ".txt"
        self.writeListToTxt(file_name=chongfu_file_name, list_data=all_chongfu_shiti_list)

        quchonghou = "".join(
            one_hang for i, one_hang in enumerate(mubiaofile_list)
            if i not in shanchu_xiabiao
        )
        quchong_file_name = jichu_name + "_quchong" + ".txt"
        self.writeStrToTxt(file_name=quchong_file_name, str=quchonghou)


if __name__ == '__main__':
    # Keep only the entities with dataType == 4 and de-duplicate by title.
    file_name = "new 2.txt"
    ziduan = "dataType"
    ziduanzhi = "4"
    quchongziduan = "roomLiveTitle"
    sx = ShaiXuanLeiXingAndBiaoTi(file_name)
    sx.getRoomLiveTitle(ziduan=ziduan, ziduanzhi=ziduanzhi, quchongziduan=quchongziduan)
标签:roomLiveTitle,mubiao,self,list,len,特定,print,dateType 来源: https://www.cnblogs.com/jingzaixin/p/16584750.html