百度搜索当天收录采集
作者:互联网
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: 么么哒
"""Scrape Baidu search results indexed within the last 24 hours.

Reads one search keyword per line from ``test.txt``, pages through Baidu
results (``pn`` = 0..740, step 10) with a fixed time-range filter
(``gpc=stf...`` — a hard-coded timestamp window; update it for a new day),
extracts every displayed result URL and appends it to ``baidu-today.txt``.
``filter()`` then writes an order-preserving deduplicated copy to
``baidu-today去重后.txt``.
"""
import re

import requests

# Loop-invariant patterns: compile once at module level instead of on
# every result of every page (they were recompiled inside the inner loop).
_DISPLAY_URL_RE = re.compile(r'","urlDisplay":"(.*?)","urlEncoded":"')
_URL_RE = re.compile(
    r'http(.*?)://([A-Za-z0-9]+[\-]?[A-Za-z0-9]+\.|[A-Za-z0-9]+\.)((\w|\?|\.|-)*)'
)

_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
}


def Reptile():
    """Crawl Baidu for every keyword in ``test.txt`` and append the
    extracted result URLs to ``./baidu-today.txt``.

    Side effects only: performs HTTP GETs and writes to the output file.
    """
    cookie = "你的cookie"  # NOTE: replace with your own logged-in Baidu cookie
    cookie_dict = {p.split("=")[0]: p.split("=")[-1] for p in cookie.split("; ")}

    # Read the keyword list ONCE: the original re-opened and re-read
    # test.txt on every one of the 75 pagination iterations.
    with open('test.txt', 'r', encoding='utf-8') as f:
        keywords = f.read().splitlines()

    # Open the output file once instead of once per keyword per page.
    with open('./baidu-today.txt', 'a+', encoding='utf-8') as out:
        for num in range(0, 750, 10):  # pn = result offset, 10 hits per page
            for text in keywords:
                target = ('https://www.baidu.com/s?wd={}&pn={}&ie=utf-8'
                          '&gpc=stf%3D1658043774%2C1658130174%7Cstftype%3D1'
                          ).format(text, num)
                print(target)
                try:
                    r = requests.get(url=target, headers=_HEADERS,
                                     cookies=cookie_dict)
                except requests.RequestException as e:
                    # One dead request should not kill the whole crawl.
                    print(e)
                    continue
                for x in _DISPLAY_URL_RE.findall(r.text):
                    try:
                        s = str(x)
                        print(s)
                        m = _URL_RE.search(s).group(0)
                        # '\n', not the original bare '\r': '\r' only parsed
                        # back correctly by accident of universal newlines.
                        out.write(str(m) + '\n')
                    except Exception as e:  # no match -> AttributeError on .group
                        print(e)


def filter():
    """Deduplicate ``./baidu-today.txt`` into ``./baidu-today去重后.txt``,
    preserving first-seen order.

    NOTE(review): the name shadows the builtin ``filter``; kept unchanged
    so any existing caller of this module still works.
    """
    try:
        with open('./baidu-today.txt', 'r') as src:  # 过滤重复的url
            lines = src.readlines()
        # dict.fromkeys is an O(n) order-preserving dedup; the original
        # set() + sort(key=list.index) was O(n^2).
        unique = list(dict.fromkeys(lines))
        # Open the output once, not once per line as the original did.
        with open('./baidu-today去重后.txt', 'a+', encoding='utf-8') as dst:
            dst.writelines(unique)
    except Exception as e:
        print(e)
    finally:
        print("恭喜你 去重复结束!")


if __name__ == "__main__":
    Reptile()
    filter()
标签:__,baidu,utf,list,采集,cookie,收录,print,百度 来源: https://www.cnblogs.com/chrales/p/16490745.html