Customs data crawler, generation 7 ("Sheng Fo" edition)
Author: collected from the Internet
# Optimized the country-group information
import requests
from fake_useragent import UserAgent
from lxml import etree
from time import sleep
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
#from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from multiprocessing import Process
import threading
import re
from selenium.webdriver.chrome.options import Options
year = [2017,2018]
month = [i for i in range(1,13)]
country_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易伙伴参数导出.csv',encoding='GBK')
province_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\收发货人注册地参数导出.csv',encoding='GBK')
trade_code_type_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易方式参数导出.csv',encoding='GBK')
country_code = country_code_name_dataform.iloc[:,0].tolist() # convert the first column to a list
province_code = province_code_name_dataform.iloc[:,0].tolist()
trade_type_code = trade_code_type_dataform.iloc[:,0].tolist()
print(country_code)
print(province_code)
print(trade_type_code)
country_t_code = ['101%2c102%2c103%2c104%2c105%2c106%2c107%2c108%2c109%2c110%2c111%2c112%2c113%2c114%2c115%2c116%2c117%2c118%2c119%2c120%2c121%2c122%2c123%2c124%2c125%2c126%2c127%2c128%2c129%2c130%2c131%2c132%2c133%2c134%2c135%2c136%2c137%2c138%2c139%2c140%2c141%2c142%2c143%2c144%2c145%2c146%2c147%2c148%2c149%2c199%2c201%2c202%2c203%2c204%2c205%2c206%2c207%2c208%2c209%2c210%2c211%2c212%2c213%2c214%2c215%2c216%2c217%2c218%2c219%2c220%2c221%2c222%2c223%2c224%2c225%2c226%2c227%2c228%2c229%2c230%2c231%2c232%2c233%2c234%2c235%2c236%2c237%2c238%2c239%2c240%2c241%2c242%2c243%2c244%2c245%2c246%2c247%2c248%2c249%2c299%2c250%2c251%2c252%2c253%2c254%2c255%2c256%2c257%2c258%2c259%2c260%2c301%2c302%2c303%2c304%2c305%2c306%2c307%2c308%2c309%2c310%2c311%2c312%2c313%2c314%2c315%2c316%2c317%2c318%2c319%2c320%2c321%2c322%2c323%2c324%2c325%2c326%2c327%2c328%2c329%2c330%2c331%2c332%2c333%2c334',
'335%2c336%2c337%2c338%2c339%2c340%2c341%2c342%2c343%2c344%2c345%2c346%2c347%2c348%2c349%2c350%2c351%2c352%2c353%2c354%2c355%2c356%2c357%2c358%2c359%2c399',
'401%2c402%2c403%2c404%2c405%2c406%2c407%2c408%2c409%2c410%2c411%2c412%2c413%2c414%2c415%2c416%2c417%2c418%2c419%2c420%2c421%2c422%2c423%2c424%2c425%2c426%2c427%2c428%2c429%2c430%2c431%2c432%2c433%2c434%2c435%2c436%2c437%2c438%2c439%2c440%2c441%2c442%2c443%2c444%2c445%2c446%2c447%2c448%2c449%2c499%2c501',
'502%2c503%2c504%2c599%2c601%2c602%2c603%2c604%2c605%2c606%2c607%2c608%2c609%2c610%2c611%2c612%2c613%2c614%2c615%2c616%2c617%2c618%2c619%2c620%2c621%2c622%2c623%2c624%2c625%2c699%2c701%2c702%2c999']
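# The '%2c' runs above are URL-encoded commas, so each string is one batch of
# country codes. A small helper (illustrative only, not part of the original
# script) to inspect a batch as a plain list:
from urllib.parse import unquote

def decode_country_group(group):
    """Decode one batch, e.g. '101%2c102' -> ['101', '102']."""
    return unquote(group).split(',')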
url_base = 'http://43.248.49.97/queryData/queryDataList?pageSize=20000&pageNum=1&iEType=1&currencyType=rmb&year={year}&startMonth={month}&endMonth={month}&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2={country}&outerValue3=&outerValue4={province}&orderType=CODE+ASC+DEFAULT&selectTableState=2&currentStartTime=201903'
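# A hedged alternative to hand-writing the query string (a sketch, not the
# original approach): build it with urllib so every parameter is escaped
# consistently. Note: pass `country` as a plain comma-joined string here;
# urlencode converts the commas to %2C itself.
from urllib.parse import urlencode

def build_query_url(year, month, country, province):
    params = {
        'pageSize': 20000, 'pageNum': 1, 'iEType': 1, 'currencyType': 'rmb',
        'year': year, 'startMonth': month, 'endMonth': month, 'monthFlag': 1,
        'unitFlag': 'true', 'codeLength': 8,
        'outerField1': 'CODE_TS', 'outerField2': 'ORIGIN_COUNTRY',
        'outerField3': 'TRADE_MODE', 'outerField4': 'TRADE_CO_PORT',
        'outerValue1': '', 'outerValue2': country, 'outerValue3': '',
        'outerValue4': province, 'orderType': 'CODE ASC DEFAULT',
        'selectTableState': 2, 'currentStartTime': 201903,
    }
    return 'http://43.248.49.97/queryData/queryDataList?' + urlencode(params)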
def url_manger(year,month,province_code,country_t_code,url_base):
request_url = []
for y in year:
for m in month:
for p in province_code:
                for c_url in country_t_code:
                    request_url.append(url_base.format(year=y, month=m, province=p, country=c_url))  # c_url is already a string, so no join is needed
f_link = {'爬取链接汇总':request_url}
f_link_df = pd.DataFrame(f_link)
f_link_df.to_csv('爬取链接汇总.csv', encoding='GBK')
return request_url
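# Hedged sketch (not in the original): web_engine below recovers the request
# parameters with a long regex; urllib can parse them directly. Caveat: parse_qs
# decodes '%2c' back to ',', so the result will not string-match the encoded
# entries in country_t_code without re-encoding.
from urllib.parse import urlparse, parse_qs

def parse_request_url(url):
    qs = parse_qs(urlparse(url).query, keep_blank_values=True)
    return qs['year'][0], qs['startMonth'][0], qs['outerValue2'][0], qs['outerValue4'][0]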
def web_engine():
global request_url
global html_response
global h
    try:
        url_send = request_url.pop()
    except IndexError:
        return  # queue exhausted
    # Recover year, month, country group, and province code from the URL itself
    url_txt_info = re.findall(r'.+ype=rmb&year=(.+)&startMonth=(.+)&endMonth=.+&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2=(.+)&outerValue3=&outerValue4=(.+)&orderType=CODE.+', url_send)
    y = url_txt_info[0][0]
    m = url_txt_info[0][1]
    cs_code = url_txt_info[0][2]
    p = url_txt_info[0][3]
    c = None  # index of the matching country group; used in the output file name
    for index, compar_code in enumerate(country_t_code):
        if cs_code == compar_code:
            c = index
            break
options = Options()
options.add_argument('--headless')
# options.add_argument('--disable-gpu')
# options.add_argument('user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
    UA = UserAgent().edge  # random Edge UA string (the browser itself is Chrome)
    options.add_argument('user-agent={}'.format(UA))  # no quotes around the value; they would become part of the UA
p_i = randint(0, len(proxys) - 1)
proxy = proxys[p_i]
# options.add_argument('''proxy-server={}'''.format(proxy)) # 124.236.111.11:80
options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
    edge = webdriver.Chrome(options=options)  # variable is named 'edge' but this is a Chrome driver; add executable_path=... if chromedriver is not on PATH
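    # Inject a script that runs before any page JS, so navigator.webdriver reads
    # as undefined (a basic anti-bot evasion via the Chrome DevTools Protocol).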
edge.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver',{
get: () => undefined
})
"""
})
edge.implicitly_wait(100)
sleep(2)
edge.get(url_send)
sleep(3)
    print('--starting-' * 15)
# edge.find_element(By.XPATH, '''//*[@id="div1"]/div/div/div/div[1]/p/span''').is_displayed()
try:
# WebDriverWait(edge, timeout=180, poll_frequency=1.5, ignored_exceptions=None).until(
# EC.visibility_of_any_elements_located((By.XPATH, '''//*[@id="div1"]/div/div/div/div[1]/p/span''')))
WebDriverWait(edge, timeout=20, poll_frequency=0.5, ignored_exceptions=None).until(
EC.visibility_of_any_elements_located((By.XPATH, '''/html/body/div/table/tbody/tr[1]/td[1]/label''')))
WebDriverWait(edge, timeout=330, poll_frequency=0.5, ignored_exceptions=None).until_not(
EC.visibility_of_any_elements_located((By.XPATH, '''//*[@id="test"]''')))
        # element = WebDriverWait(edge, timeout=180, poll_frequency=1.5, ignored_exceptions=None).until(
        #     edge.find_element(By.XPATH, '''//*[@id="div1"]/div/div/div/div[1]/p/span''').is_displayed())  # wrong usage: until() expects a callable, not a bool
html_response = edge.page_source
e = etree.HTML(html_response)
data_date = e.xpath('''//tbody[@id = 'div1']/tr/td[1]/div/text()''')
goods_code = e.xpath('''//tbody[@id = 'div1']/tr/td[2]/div/text()''')
goods_name = e.xpath('''//tbody[@id = 'div1']/tr/td[3]/div/text()''')
partner_code = e.xpath('''//tbody[@id = 'div1']/tr/td[4]/div/text()''')
partner_name = e.xpath('''//tbody[@id = 'div1']/tr/td[5]/div/text()''')
trade_code = e.xpath('''//tbody[@id = 'div1']/tr/td[6]/div/text()''')
trade_name = e.xpath('''//tbody[@id = 'div1']/tr/td[7]/div/text()''')
in_province_code = e.xpath('''//tbody[@id = 'div1']/tr/td[8]/div/text()''')
in_province_name = e.xpath('''//tbody[@id = 'div1']/tr/td[9]/div/text()''')
first_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[10]/div/text()''')
first_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[11]/div/text()''')
second_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[12]/div/text()''')
second_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[13]/div/text()''')
rmb_value = e.xpath('''//tbody[@id = 'div1']/tr/td[14]/div/text()''')
        # Column headers are kept in Chinese (they feed the output CSVs): data month,
        # HS code, product name, partner code/name, trade-mode code/name,
        # registered-place code/name, first/second quantity and unit, RMB value
        all_info = {
'数据年月': data_date,
'商品编码': goods_code,
'商品名称': goods_name,
'贸易伙伴编码': partner_code,
'贸易伙伴名称': partner_name,
'贸易方式编码': trade_code,
'贸易方式名称': trade_name,
'注册地编码': in_province_code,
'注册地名称': in_province_name,
'第一数量': first_unit_num,
'第一计量单位': first_unit_name,
'第二数量': second_unit_num,
'第二计量单位': second_unit_name,
'人民币': rmb_value
}
outdata = pd.DataFrame(all_info)
outdata.to_csv('{0}年{1}月{2}省市{3}国家进口数据.csv'.format(y, m, p, c), encoding='GBK')
edge.quit()
h += 1
        with open('爬取成功链接列表.txt', 'a') as f_success:  # log of successfully crawled URLs (used for resuming)
            f_success.write(url_send + '\n')
        print('*' * 100)
        length = len(data_date)
        print('success', y, '\t', m, '\t', p, '\t', c, '\t', 'rows:', length)
        print('crawled:', h, '\t', 'total:', kill_num, '\t', 'progress:', h / kill_num * 100, '%')
        print('*!' * 50)
    except Exception:
        with open('爬取失败链接列表.txt', 'a') as f_fail:  # log of failed URLs
            f_fail.write(url_send + '\n')
        request_url = [url_send] + request_url  # push the failed URL back to the front of the queue for retry
        edge.quit()
        print('~' * 100)
        print('failed', y, '\t', m, '\t', p, '\t', c)
        print('~!' * 50)
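# Hedged alternative to the 14 separate XPath queries above (a sketch, not the
# original approach): pandas can usually lift an HTML table into a DataFrame in
# one call. Column names and order are assumed to match the site's table layout.
def parse_table(html):
    try:
        return pd.read_html(html)[0]  # first <table> on the page
    except ValueError:
        return None  # no table found in the HTML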
if __name__ == '__main__':
proxys = ['http://124.236.111.11:80','http://140.143.142.200:1080','https://123.149.136.245:9999','https://115.221.246.157:9999',
'https://115.221.244.206:9999','https://58.220.95.30:10174','https://175.42.128.5:9999','https://36.112.139.146:3128',
'http://1.196.177.100:9999','https://110.243.16.93:9999']
request_url = url_manger(year, month, province_code, country_t_code, url_base)
    # Resume support: drop URLs already logged as successful (the log may not exist on the first run)
    try:
        with open('爬取成功链接列表.txt', 'r') as f_set_success:
            used_request_url = f_set_success.read()
    except FileNotFoundError:
        used_request_url = ''
    used_request_url_set = set(used_request_url.split('\n'))  # one URL per line, converted to a set
    request_url_set = set(request_url)
    request_url_set_end = request_url_set - used_request_url_set  # de-duplicate against past successes
    request_url = list(request_url_set_end)
kill_num = len(request_url)
    html_response = 'kkkk'  # placeholder; overwritten by the worker threads
# print(len(request_url))
# web_engine1 = web_engine(request_url, timeout=500, frequency=1)
    h = 0  # count of successfully crawled URLs
    z_h = 0  # unused
    while True:
        # Launch four staggered worker threads per round; the random pauses
        # spread the requests out so the traffic looks less machine-like
        workers = [threading.Thread(target=web_engine) for _ in range(4)]
        for w in workers:
            w.start()
            sleep(randint(10, 30))
        if h % 30 == 8:
            sleep(120)  # periodic longer cool-down
        if h >= kill_num:
            print('crawled:', h, '\t', 'total:', kill_num, '\t', 'progress:', h / kill_num * 100, '%')
            break
    print('crawl finished')
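# The worker threads above share the global request_url list; CPython's GIL makes
# a single list operation atomic, but the pop-then-requeue pattern is still racy.
# A thread-safe alternative (a sketch, not part of the original script):
import queue

def make_work_queue(urls):
    """Workers would call q.get_nowait() for the next URL and q.put(url) on failure."""
    q = queue.Queue()
    for u in urls:
        q.put(u)
    return q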
# Version 2: attach a session cookie (add_cookie without the 'domain' field)
import requests
from fake_useragent import UserAgent
from lxml import etree
from time import sleep
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
#from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from multiprocessing import Process
import threading
import re
from tqdm import tqdm
from selenium.webdriver.chrome.options import Options
year = [2017,2018]
month = [i for i in range(1,13)]
country_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易伙伴参数导出.csv',encoding='GBK')
province_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\收发货人注册地参数导出.csv',encoding='GBK')
trade_code_type_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易方式参数导出.csv',encoding='GBK')
country_code = country_code_name_dataform.iloc[:,0].tolist() # convert the first column to a list
province_code = province_code_name_dataform.iloc[:,0].tolist()
trade_type_code = trade_code_type_dataform.iloc[:,0].tolist()
print(country_code)
print(province_code)
print(trade_type_code)
country_t_code = ['101%2c102%2c103%2c104%2c105%2c106%2c107%2c108%2c109%2c110%2c111%2c112%2c113%2c114%2c115%2c116%2c117%2c118%2c119%2c120%2c121%2c122%2c123%2c124%2c125%2c126%2c127%2c128%2c129%2c130%2c131%2c132%2c133%2c134%2c135%2c136%2c137%2c138%2c139%2c140%2c141%2c142%2c143%2c144%2c145%2c146%2c147%2c148%2c149%2c199%2c201%2c202%2c203%2c204%2c205%2c206%2c207%2c208%2c209%2c210%2c211%2c212%2c213%2c214%2c215%2c216%2c217%2c218%2c219%2c220%2c221%2c222%2c223%2c224%2c225%2c226%2c227%2c228%2c229%2c230%2c231%2c232%2c233%2c234%2c235%2c236%2c237%2c238%2c239%2c240%2c241%2c242%2c243%2c244%2c245%2c246%2c247%2c248%2c249%2c299%2c250%2c251%2c252%2c253%2c254%2c255%2c256%2c257%2c258%2c259%2c260%2c301%2c302%2c303%2c304%2c305%2c306%2c307%2c308%2c309%2c310%2c311%2c312%2c313%2c314%2c315%2c316%2c317%2c318%2c319%2c320%2c321%2c322%2c323%2c324%2c325%2c326%2c327%2c328%2c329%2c330%2c331%2c332%2c333%2c334',
'335%2c336%2c337%2c338%2c339%2c340%2c341%2c342%2c343%2c344%2c345%2c346%2c347%2c348%2c349%2c350%2c351%2c352%2c353%2c354%2c355%2c356%2c357%2c358%2c359%2c399',
'401%2c402%2c403%2c404%2c405%2c406%2c407%2c408%2c409%2c410%2c411%2c412%2c413%2c414%2c415%2c416%2c417%2c418%2c419%2c420%2c421%2c422%2c423%2c424%2c425%2c426%2c427%2c428%2c429%2c430%2c431%2c432%2c433%2c434%2c435%2c436%2c437%2c438%2c439%2c440%2c441%2c442%2c443%2c444%2c445%2c446%2c447%2c448%2c449%2c499%2c501',
'502%2c503%2c504%2c599%2c601%2c602%2c603%2c604%2c605%2c606%2c607%2c608%2c609%2c610%2c611%2c612%2c613%2c614%2c615%2c616%2c617%2c618%2c619%2c620%2c621%2c622%2c623%2c624%2c625%2c699%2c701%2c702%2c999']
url_base = 'http://43.248.49.97/queryData/queryDataList?pageSize=20000&pageNum=1&iEType=1&currencyType=rmb&year={year}&startMonth={month}&endMonth={month}&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2={country}&outerValue3=&outerValue4={province}&orderType=CODE+ASC+DEFAULT&selectTableState=2&currentStartTime=201903'
def url_manger(year,month,province_code,country_t_code,url_base):
request_url = []
for y in year:
for m in month:
for p in province_code:
                for c_url in country_t_code:
                    request_url.append(url_base.format(year=y, month=m, province=p, country=c_url))  # c_url is already a string, so no join is needed
f_link = {'爬取链接汇总':request_url}
f_link_df = pd.DataFrame(f_link)
f_link_df.to_csv('爬取链接汇总.csv', encoding='GBK')
return request_url
def web_engine():
global request_url
global html_response
global h
    try:
        url_send = request_url.pop()
    except IndexError:
        return  # queue exhausted
    url_txt_info = re.findall(r'.+ype=rmb&year=(.+)&startMonth=(.+)&endMonth=.+&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2=(.+)&outerValue3=&outerValue4=(.+)&orderType=CODE.+', url_send)
    y = url_txt_info[0][0]
    m = url_txt_info[0][1]
    cs_code = url_txt_info[0][2]
    p = url_txt_info[0][3]
    c = None  # index of the matching country group; used in the output file name
    for index, compar_code in enumerate(country_t_code):
        if cs_code == compar_code:
            c = index
            break
options = Options()
options.add_argument('--headless')
# options.add_argument('--disable-gpu')
# options.add_argument('user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
    UA = UserAgent().chrome  # random Chrome UA string
    options.add_argument('user-agent={}'.format(UA))  # no quotes around the value; they would become part of the UA
p_i = randint(0, len(proxys) - 1)
proxy = proxys[p_i]
# options.add_argument('''proxy-server={}'''.format(proxy)) # 124.236.111.11:80
options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
    edge = webdriver.Chrome(options=options)  # variable is named 'edge' but this is a Chrome driver; add executable_path=... if chromedriver is not on PATH
edge.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver',{
get: () => undefined
})
"""
})
edge.get('http://43.248.49.97/')
sleep(5)
    # Cookie captured from a real browser session; 'expiry' is a Unix timestamp
    # (about 2020-11-29 here), so the value must be refreshed when it lapses
    cookie_dict = {
'expiry': 1606620717,
'httpOnly': False,
'name': '9CKCOkIaqzqET',
'path': '/',
'secure': False,
'value': '5Y.k1NYFa0jVxcAfZya710GyNOswSINgrExzcBSh69V3b_3VvaJSG2Acij66UYQ2oH0JicC0V0LUSYYipkbwWtIy3qMMfBI4dj6T_5a4oFj1ROpXPdp2IMS2B2BACcbbDZOMIF2r0incao6q5gHO3dpmE8sLIsLuTdOBDcAcIpL_40_nUBbzFGi5H697kMIQqXy.Fk8l1gb8b2x_rMFtM4VgZnA6dJ8PrSLUFk.RjHLmAj2VCF8rVaJhCFqXirP1Kl'
}
    edge.add_cookie(cookie_dict)  # add_cookie only works after loading a page on the target domain, hence the warm-up get() above
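    # Hedged refinement (an assumption, not in the original): rather than pasting
    # the cookie into the source, a fresher copy could be read back from the
    # warm-up visit itself and persisted between runs, e.g.:
    #   anti_bot = {ck['name']: ck['value'] for ck in edge.get_cookies()}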
edge.implicitly_wait(100)
sleep(2)
edge.get(url_send)
sleep(3)
    print('--starting-' * 15)
try:
WebDriverWait(edge, timeout=40, poll_frequency=0.5, ignored_exceptions=None).until(
EC.visibility_of_any_elements_located((By.XPATH, '''/html/body/div/table/tbody/tr[1]/td[1]/label''')))
WebDriverWait(edge, timeout=330, poll_frequency=0.5, ignored_exceptions=None).until_not(
EC.visibility_of_any_elements_located((By.XPATH, '''//*[@id="test"]''')))
        # element = WebDriverWait(edge, timeout=180, poll_frequency=1.5, ignored_exceptions=None).until(
        #     edge.find_element(By.XPATH, '''//*[@id="div1"]/div/div/div/div[1]/p/span''').is_displayed())  # wrong usage: until() expects a callable, not a bool
html_response = edge.page_source
e = etree.HTML(html_response)
data_date = e.xpath('''//tbody[@id = 'div1']/tr/td[1]/div/text()''')
goods_code = e.xpath('''//tbody[@id = 'div1']/tr/td[2]/div/text()''')
goods_name = e.xpath('''//tbody[@id = 'div1']/tr/td[3]/div/text()''')
partner_code = e.xpath('''//tbody[@id = 'div1']/tr/td[4]/div/text()''')
partner_name = e.xpath('''//tbody[@id = 'div1']/tr/td[5]/div/text()''')
trade_code = e.xpath('''//tbody[@id = 'div1']/tr/td[6]/div/text()''')
trade_name = e.xpath('''//tbody[@id = 'div1']/tr/td[7]/div/text()''')
in_province_code = e.xpath('''//tbody[@id = 'div1']/tr/td[8]/div/text()''')
in_province_name = e.xpath('''//tbody[@id = 'div1']/tr/td[9]/div/text()''')
first_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[10]/div/text()''')
first_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[11]/div/text()''')
second_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[12]/div/text()''')
second_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[13]/div/text()''')
rmb_value = e.xpath('''//tbody[@id = 'div1']/tr/td[14]/div/text()''')
all_info = {
'数据年月': data_date,
'商品编码': goods_code,
'商品名称': goods_name,
'贸易伙伴编码': partner_code,
'贸易伙伴名称': partner_name,
'贸易方式编码': trade_code,
'贸易方式名称': trade_name,
'注册地编码': in_province_code,
'注册地名称': in_province_name,
'第一数量': first_unit_num,
'第一计量单位': first_unit_name,
'第二数量': second_unit_num,
'第二计量单位': second_unit_name,
'人民币': rmb_value
}
outdata = pd.DataFrame(all_info)
outdata.to_csv('{0}年{1}月{2}省市{3}国家进口数据.csv'.format(y, m, p, c), encoding='GBK')
edge.quit()
h += 1
        with open('爬取成功链接列表.txt', 'a') as f_success:
            f_success.write(url_send + '\n')
        print('*' * 100)
        length = len(data_date)
        print('success', y, '\t', m, '\t', p, '\t', c, '\t', 'rows:', length)
        print('crawled:', h, '\t', 'total:', kill_num, '\t', 'progress:', h / kill_num * 100, '%')
        print('*!' * 50)
    except Exception:
        with open('爬取失败链接列表.txt', 'a') as f_fail:
            f_fail.write(url_send + '\n')
        request_url = [url_send] + request_url  # push the failed URL back to the front of the queue for retry
        edge.quit()
        print('~' * 100)
        print('failed', y, '\t', m, '\t', p, '\t', c)
        print('~!' * 50)
if __name__ == '__main__':
    proxys = ['http://61.135.169.121:80']
    # Proxies tried earlier:
    # 'http://117.185.17.151:80', 'http://61.135.185.31:80', 'http://122.147.141.151:80',
    # 'http://180.149.144.224:80', 'http://180.97.34.35:80', 'http://123.125.114.107:80',
    # 'http://119.147.210.236:3128', 'http://101.132.143.232:80', 'http://218.59.139.238:80',
    # 'http://43.254.221.27:80', 'http://221.182.31.54:8080',
    # 'http://124.236.111.11:80', 'http://140.143.142.200:1080', 'https://123.149.136.245:9999',
    # 'https://115.221.246.157:9999', 'https://115.221.244.206:9999', 'https://58.220.95.30:10174',
    # 'https://175.42.128.5:9999', 'https://36.112.139.146:3128', 'http://1.196.177.100:9999',
    # 'https://110.243.16.93:9999'
request_url = url_manger(year, month, province_code, country_t_code, url_base)
    # Resume support: drop URLs already logged as successful (the log may not exist on the first run)
    try:
        with open('爬取成功链接列表.txt', 'r') as f_set_success:
            used_request_url = f_set_success.read()
    except FileNotFoundError:
        used_request_url = ''
    used_request_url_set = set(used_request_url.split('\n'))  # one URL per line, converted to a set
    request_url_set = set(request_url)
    request_url_set_end = request_url_set - used_request_url_set  # de-duplicate against past successes
    request_url = list(request_url_set_end)
kill_num = len(request_url)
    html_response = 'kkkk'  # placeholder; overwritten by the worker threads
    h = 0  # count of successfully crawled URLs
    z_h = 0  # unused
    while True:
        # Launch four staggered workers; join the first and third so each round
        # does not pile new browsers on top of still-running ones
        workers = [threading.Thread(target=web_engine) for _ in range(4)]
        for w in workers[:3]:
            w.start()
            sleep(randint(30, 40))
        workers[3].start()
        workers[0].join()
        workers[2].join()
        print('sleeping...')
        sleep(randint(30, 40))
        if h >= kill_num:
            print('crawled:', h, '\t', 'total:', kill_num, '\t', 'progress:', h / kill_num * 100, '%')
            break
    print('crawl finished')
Source: https://blog.csdn.net/qq_42830971/article/details/109771313