爬虫-姓名测试打分2
作者:互联网
一、获取汉字
# Part 1: crawl the pinyin index at xh.5156edu.com and collect every Chinese
# character listed under each pinyin, writing them to hanzi.txt
# (one "|"-joined group of characters per line).
import time

import requests
from bs4 import BeautifulSoup

session = requests.session()

# http://xh.5156edu.com/pinyi.html  -- navigation page listing every pinyin
# https://www.xingming.com/dafen/   -- name-scoring site (used in part 2)
url1 = "http://xh.5156edu.com/pinyi.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
}

r1 = session.get(url1, headers=headers, timeout=30)
soup = BeautifulSoup(r1.content, 'lxml')
list1 = soup.select("tr > td > a.fontbox")

# One [href, pinyin-text] pair per pinyin link on the index page.
list2 = [[a.get("href"), a.text.strip()] for a in list1]


def f2(url2):
    """Return the characters listed on one pinyin page.

    Takes the first character of each link text, e.g. from
    http://xh.5156edu.com/html2/p105.html.
    """
    r2 = session.get(url2, headers=headers, timeout=30)
    r2.encoding = 'gb18030'  # the site serves GB-encoded pages
    page = BeautifulSoup(r2.text, 'lxml')
    return [a.text[0] for a in page.select("a.fontbox")]


list5 = []
for href, _pinyin in list2:
    i2 = "http://xh.5156edu.com/" + href
    print(i2)
    list5.append(f2(i2))
    time.sleep(1)  # throttle: be polite to the server

# Write the characters out; the `with` block closes the file, so no
# explicit f.close() is needed.
with open("hanzi.txt", "w", encoding="utf8") as f:
    for group in list5:
        f.write("|".join(group) + "\n")
二、获取打分网站的评分
# -*- coding: utf-8 -*-
"""
Part 2: score every character collected in hanzi.txt.

For each character C, POST the candidate name 李金C to
https://www.xingming.com/dafen/ and record the returned score
in an Excel sheet (soc.xlsx).

Created on Sun Nov 21 22:31:06 2021
@author: Administrator
"""
import datetime
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

session = requests.session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
}

# hanzi.txt was produced by part 1: one "|"-joined group per line.
with open("hanzi.txt", "r", encoding="utf8") as f:
    list1 = f.readlines()

# Flatten all lines into one list of single characters (~21763 in total).
list2 = []
for line in list1:
    parts = line.strip().split("|")
    if parts:
        list2.extend(parts)


def ff3(ming):
    """POST the name 李金{ming} to the scoring site and return the score text.

    Falls back to the first 15 characters of the page text when no score
    element is present (e.g. the site rejected the character), so the
    failure reason is recorded instead of a score.
    """
    url3 = "https://www.xingming.com/dafen/"
    dict0 = {'xs': '李', 'mz': f'金{ming}', 'action': 'test'}
    # Use the shared session for connection reuse across ~20k requests.
    r4 = session.post(url3, data=dict0, headers=headers, timeout=30)
    soup = BeautifulSoup(r4.content, 'lxml')
    try:
        score = soup.select("font[color='ff0000']")[0].text
    except IndexError:
        score = soup.text[:15]
    return score


# One row per unique character; column '2' holds the score once fetched.
# NOTE: the original filtered with df1[df1['2'] != None], but every value in
# column '2' starts as None, so in modern pandas that elementwise comparison
# yields an all-False mask and an EMPTY frame -- the scoring loop would never
# run. All unique rows are kept instead, which is the evident intent.
df1 = pd.DataFrame([[ch, None] for ch in list2], columns=['1', '2'])
df2 = df1.drop_duplicates().reset_index(drop=True).copy()

for i in range(df2.shape[0]):
    now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    if df2.iloc[i, 1] is not None:  # already scored (e.g. a resumed run)
        continue
    ming = df2.iloc[i, 0]
    try:
        soc = ff3(ming)
        try:
            # Store numeric scores as floats; keep the raw text otherwise
            # (e.g. the error-page excerpt returned by ff3's fallback).
            soc = float(soc)
        except ValueError:
            pass
        df2.iloc[i, 1] = soc
    except Exception as e:
        # Best-effort: log the failure and move on to the next character.
        print(now_time, "----err---", str(e), df2.iloc[i, 0])
    if i % 100 == 0:
        print(now_time, "----------", i)  # progress marker
    time.sleep(0.2)  # throttle requests

df2.to_excel("soc.xlsx")
标签:soup,爬虫,df2,headers,姓名,import,ming,com,打分 来源: https://www.cnblogs.com/andylhc/p/15832646.html