
Customs Crawler, Generation 7 (Sheng Fo Edition)


# Version 1: optimized the country-code grouping
from fake_useragent import UserAgent
from lxml import etree
from time import sleep
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import threading
import re
from selenium.webdriver.chrome.options import Options
year = [2017,2018]
month = [i for i in range(1,13)]
country_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易伙伴参数导出.csv',encoding='GBK')
province_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\收发货人注册地参数导出.csv',encoding='GBK')
trade_code_type_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易方式参数导出.csv',encoding='GBK')
country_code = country_code_name_dataform.iloc[:,0].tolist()  # first column as a plain list
province_code = province_code_name_dataform.iloc[:,0].tolist()
trade_type_code = trade_code_type_dataform.iloc[:,0].tolist()
print(country_code)
print(province_code)
print(trade_type_code)

country_t_code = ['101%2c102%2c103%2c104%2c105%2c106%2c107%2c108%2c109%2c110%2c111%2c112%2c113%2c114%2c115%2c116%2c117%2c118%2c119%2c120%2c121%2c122%2c123%2c124%2c125%2c126%2c127%2c128%2c129%2c130%2c131%2c132%2c133%2c134%2c135%2c136%2c137%2c138%2c139%2c140%2c141%2c142%2c143%2c144%2c145%2c146%2c147%2c148%2c149%2c199%2c201%2c202%2c203%2c204%2c205%2c206%2c207%2c208%2c209%2c210%2c211%2c212%2c213%2c214%2c215%2c216%2c217%2c218%2c219%2c220%2c221%2c222%2c223%2c224%2c225%2c226%2c227%2c228%2c229%2c230%2c231%2c232%2c233%2c234%2c235%2c236%2c237%2c238%2c239%2c240%2c241%2c242%2c243%2c244%2c245%2c246%2c247%2c248%2c249%2c299%2c250%2c251%2c252%2c253%2c254%2c255%2c256%2c257%2c258%2c259%2c260%2c301%2c302%2c303%2c304%2c305%2c306%2c307%2c308%2c309%2c310%2c311%2c312%2c313%2c314%2c315%2c316%2c317%2c318%2c319%2c320%2c321%2c322%2c323%2c324%2c325%2c326%2c327%2c328%2c329%2c330%2c331%2c332%2c333%2c334',
                  '335%2c336%2c337%2c338%2c339%2c340%2c341%2c342%2c343%2c344%2c345%2c346%2c347%2c348%2c349%2c350%2c351%2c352%2c353%2c354%2c355%2c356%2c357%2c358%2c359%2c399',

                '401%2c402%2c403%2c404%2c405%2c406%2c407%2c408%2c409%2c410%2c411%2c412%2c413%2c414%2c415%2c416%2c417%2c418%2c419%2c420%2c421%2c422%2c423%2c424%2c425%2c426%2c427%2c428%2c429%2c430%2c431%2c432%2c433%2c434%2c435%2c436%2c437%2c438%2c439%2c440%2c441%2c442%2c443%2c444%2c445%2c446%2c447%2c448%2c449%2c499%2c501',
                '502%2c503%2c504%2c599%2c601%2c602%2c603%2c604%2c605%2c606%2c607%2c608%2c609%2c610%2c611%2c612%2c613%2c614%2c615%2c616%2c617%2c618%2c619%2c620%2c621%2c622%2c623%2c624%2c625%2c699%2c701%2c702%2c999']
url_base = 'http://43.248.49.97/queryData/queryDataList?pageSize=20000&pageNum=1&iEType=1&currencyType=rmb&year={year}&startMonth={month}&endMonth={month}&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2={country}&outerValue3=&outerValue4={province}&orderType=CODE+ASC+DEFAULT&selectTableState=2&currentStartTime=201903'
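
The %2c sequences in country_t_code above are URL-encoded commas: each string is one pre-encoded group of country codes that gets substituted into the outerValue2 slot of url_base. As a minimal sketch (not part of the original script; the helper name and sample codes are illustrative), such a group could be generated from a plain list instead of being hard-coded:

from urllib.parse import quote

def encode_code_group(codes):
    # Join the codes with commas and percent-encode the separators, giving
    # strings like '101%2C102%2C103' (percent-encoding is case-insensitive,
    # so this is equivalent to the lowercase '%2c' used above).
    return quote(','.join(str(c) for c in codes), safe='')

# e.g. encode_code_group(range(101, 150)) reproduces the start of the first group
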
def url_manager(year, month, province_code, country_t_code, url_base):
    # Build the full Cartesian product of year x month x province x country group.
    request_url = []
    for y in year:
        for m in month:
            for p in province_code:
                for c_url in country_t_code:
                    request_url.append(url_base.format(year=y, month=m, province=p, country=c_url))
    f_link = {'爬取链接汇总': request_url}
    f_link_df = pd.DataFrame(f_link)
    f_link_df.to_csv('爬取链接汇总.csv', encoding='GBK')
    return request_url


def web_engine():
    global request_url
    global h
    url_send = request_url.pop()
    # Recover year, month, country group and province from the URL itself.
    url_txt_info = re.findall(r'.+ype=rmb&year=(.+)&startMonth=(.+)&endMonth=.+&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2=(.+)&outerValue3=&outerValue4=(.+)&orderType=CODE.+', url_send)
    y, m, cs_code, p = url_txt_info[0]
    c = country_t_code.index(cs_code)  # index of the country group; used in the output file name
    options = Options()
    options.add_argument('--headless')
    UA = UserAgent().edge
    options.add_argument('user-agent={}'.format(UA))
    proxy = proxys[randint(0, len(proxys) - 1)]  # picked but not applied; uncomment below to route through it
    # options.add_argument('proxy-server={}'.format(proxy))
    options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
    edge = webdriver.Chrome(options=options)  # pass executable_path=... if chromedriver is not on PATH
    # Hide navigator.webdriver before any page script runs (basic anti-bot evasion).
    edge.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver',{
            get: () => undefined
            })
            """
            })

    edge.implicitly_wait(100)
    sleep(2)
    edge.get(url_send)
    sleep(3)
    print('--start-' * 15)
    try:
        # Wait until the first data row is visible...
        WebDriverWait(edge, timeout=20, poll_frequency=0.5).until(
            EC.visibility_of_any_elements_located((By.XPATH, '/html/body/div/table/tbody/tr[1]/td[1]/label')))
        # ...then wait for the loading indicator to disappear.
        WebDriverWait(edge, timeout=330, poll_frequency=0.5).until_not(
            EC.visibility_of_any_elements_located((By.XPATH, '//*[@id="test"]')))
        html_response = edge.page_source
        e = etree.HTML(html_response)
        data_date = e.xpath('''//tbody[@id = 'div1']/tr/td[1]/div/text()''')
        goods_code = e.xpath('''//tbody[@id = 'div1']/tr/td[2]/div/text()''')
        goods_name = e.xpath('''//tbody[@id = 'div1']/tr/td[3]/div/text()''')
        partner_code = e.xpath('''//tbody[@id = 'div1']/tr/td[4]/div/text()''')
        partner_name = e.xpath('''//tbody[@id = 'div1']/tr/td[5]/div/text()''')
        trade_code = e.xpath('''//tbody[@id = 'div1']/tr/td[6]/div/text()''')
        trade_name = e.xpath('''//tbody[@id = 'div1']/tr/td[7]/div/text()''')
        in_province_code = e.xpath('''//tbody[@id = 'div1']/tr/td[8]/div/text()''')
        in_province_name = e.xpath('''//tbody[@id = 'div1']/tr/td[9]/div/text()''')
        first_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[10]/div/text()''')
        first_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[11]/div/text()''')
        second_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[12]/div/text()''')
        second_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[13]/div/text()''')
        rmb_value = e.xpath('''//tbody[@id = 'div1']/tr/td[14]/div/text()''')
        all_info = {
            '数据年月': data_date,
            '商品编码': goods_code,
            '商品名称': goods_name,
            '贸易伙伴编码': partner_code,
            '贸易伙伴名称': partner_name,
            '贸易方式编码': trade_code,
            '贸易方式名称': trade_name,
            '注册地编码': in_province_code,
            '注册地名称': in_province_name,
            '第一数量': first_unit_num,
            '第一计量单位': first_unit_name,
            '第二数量': second_unit_num,
            '第二计量单位': second_unit_name,
            '人民币': rmb_value
        }
        outdata = pd.DataFrame(all_info)
        outdata.to_csv('{0}年{1}月{2}省市{3}国家进口数据.csv'.format(y, m, p, c), encoding='GBK')
        edge.quit()
        h += 1
        with open('爬取成功链接列表.txt', 'a') as f_success:
            f_success.write(url_send + '\n')
        print('*' * 100)
        length = len(data_date)
        print('Success', y, '\t', m, '\t', p, '\t', c, '\t', 'rows:', length)
        print('Crawled:', h, '\t', 'total:', kill_num, '\t', 'progress:', h / kill_num * 100, '%')
        print('*!' * 50)
    except Exception:
        with open('爬取失败链接列表.txt', 'a') as f_fail:
            f_fail.write(url_send + '\n')
        # Push the failed URL back onto the queue for a later retry.
        request_url = [url_send] + request_url
        edge.quit()
        print('~' * 100)
        print('Failed', y, '\t', m, '\t', p, '\t', c)
        print('~!' * 50)


if __name__ == '__main__':
    proxys = ['http://124.236.111.11:80','http://140.143.142.200:1080','https://123.149.136.245:9999','https://115.221.246.157:9999',
              'https://115.221.244.206:9999','https://58.220.95.30:10174','https://175.42.128.5:9999','https://36.112.139.146:3128',
              'http://1.196.177.100:9999','https://110.243.16.93:9999']
    request_url = url_manager(year, month, province_code, country_t_code, url_base)
    try:
        with open('爬取成功链接列表.txt', 'r') as f_set_success:
            used_request_url = f_set_success.read()
    except FileNotFoundError:
        used_request_url = ''  # first run: no success log yet
    used_request_url_set = set(used_request_url.split('\n'))
    request_url = list(set(request_url) - used_request_url_set)  # skip URLs already crawled
    kill_num = len(request_url)
    h = 0
    while True:
        # Launch four crawler threads per round, with a random pause between starts.
        for _ in range(4):
            threading.Thread(target=web_engine).start()
            sleep(randint(10, 30))
        if h % 30 == 8:
            sleep(120)  # periodic long pause to ease off the server
        if h > kill_num:
            print('Crawled:', h, '\t', 'total:', kill_num, '\t', 'progress:', h / kill_num * 100, '%')
            break
    print('Crawler finished')
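
Version 1 hands URLs to its threads through a plain global list. request_url.pop() is atomic under CPython's GIL, but the failure path rebinds the global (request_url = [url_send] + request_url), which can race with concurrent pops. As a sketch of an alternative (not in the original; process() is a stand-in for web_engine()'s fetch-and-parse body), queue.Queue gives the same producer/consumer pattern with thread safety by construction:

import queue
import threading

def crawl_with_queue(urls, process, n_workers=4):
    url_queue = queue.Queue()
    for u in urls:
        url_queue.put(u)

    def worker():
        while True:
            try:
                url = url_queue.get(timeout=5)
            except queue.Empty:
                return                  # queue drained; worker exits
            try:
                process(url)            # stand-in for the fetch/parse logic
            except Exception:
                url_queue.put(url)      # thread-safe re-queue (a retry cap would be wise)
            finally:
                url_queue.task_done()

    workers = [threading.Thread(target=worker) for _ in range(n_workers)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
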




# Version 2: injects a cookie (visit the site first, then add_cookie without a 'domain' field)
from fake_useragent import UserAgent
from lxml import etree
from time import sleep
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import threading
import re
from selenium.webdriver.chrome.options import Options
year = [2017,2018]
month = [i for i in range(1,13)]
country_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易伙伴参数导出.csv',encoding='GBK')
province_code_name_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\收发货人注册地参数导出.csv',encoding='GBK')
trade_code_type_dataform = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\贸易方式参数导出.csv',encoding='GBK')
country_code = country_code_name_dataform.iloc[:,0].tolist()  # first column as a plain list
province_code = province_code_name_dataform.iloc[:,0].tolist()
trade_type_code = trade_code_type_dataform.iloc[:,0].tolist()
print(country_code)
print(province_code)
print(trade_type_code)

country_t_code = ['101%2c102%2c103%2c104%2c105%2c106%2c107%2c108%2c109%2c110%2c111%2c112%2c113%2c114%2c115%2c116%2c117%2c118%2c119%2c120%2c121%2c122%2c123%2c124%2c125%2c126%2c127%2c128%2c129%2c130%2c131%2c132%2c133%2c134%2c135%2c136%2c137%2c138%2c139%2c140%2c141%2c142%2c143%2c144%2c145%2c146%2c147%2c148%2c149%2c199%2c201%2c202%2c203%2c204%2c205%2c206%2c207%2c208%2c209%2c210%2c211%2c212%2c213%2c214%2c215%2c216%2c217%2c218%2c219%2c220%2c221%2c222%2c223%2c224%2c225%2c226%2c227%2c228%2c229%2c230%2c231%2c232%2c233%2c234%2c235%2c236%2c237%2c238%2c239%2c240%2c241%2c242%2c243%2c244%2c245%2c246%2c247%2c248%2c249%2c299%2c250%2c251%2c252%2c253%2c254%2c255%2c256%2c257%2c258%2c259%2c260%2c301%2c302%2c303%2c304%2c305%2c306%2c307%2c308%2c309%2c310%2c311%2c312%2c313%2c314%2c315%2c316%2c317%2c318%2c319%2c320%2c321%2c322%2c323%2c324%2c325%2c326%2c327%2c328%2c329%2c330%2c331%2c332%2c333%2c334',
                  '335%2c336%2c337%2c338%2c339%2c340%2c341%2c342%2c343%2c344%2c345%2c346%2c347%2c348%2c349%2c350%2c351%2c352%2c353%2c354%2c355%2c356%2c357%2c358%2c359%2c399',

                '401%2c402%2c403%2c404%2c405%2c406%2c407%2c408%2c409%2c410%2c411%2c412%2c413%2c414%2c415%2c416%2c417%2c418%2c419%2c420%2c421%2c422%2c423%2c424%2c425%2c426%2c427%2c428%2c429%2c430%2c431%2c432%2c433%2c434%2c435%2c436%2c437%2c438%2c439%2c440%2c441%2c442%2c443%2c444%2c445%2c446%2c447%2c448%2c449%2c499%2c501',
                '502%2c503%2c504%2c599%2c601%2c602%2c603%2c604%2c605%2c606%2c607%2c608%2c609%2c610%2c611%2c612%2c613%2c614%2c615%2c616%2c617%2c618%2c619%2c620%2c621%2c622%2c623%2c624%2c625%2c699%2c701%2c702%2c999']
url_base = 'http://43.248.49.97/queryData/queryDataList?pageSize=20000&pageNum=1&iEType=1&currencyType=rmb&year={year}&startMonth={month}&endMonth={month}&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2={country}&outerValue3=&outerValue4={province}&orderType=CODE+ASC+DEFAULT&selectTableState=2&currentStartTime=201903'
def url_manager(year, month, province_code, country_t_code, url_base):
    # Build the full Cartesian product of year x month x province x country group.
    request_url = []
    for y in year:
        for m in month:
            for p in province_code:
                for c_url in country_t_code:
                    request_url.append(url_base.format(year=y, month=m, province=p, country=c_url))
    f_link = {'爬取链接汇总': request_url}
    f_link_df = pd.DataFrame(f_link)
    f_link_df.to_csv('爬取链接汇总.csv', encoding='GBK')
    return request_url


def web_engine():
    global request_url
    global h
    url_send = request_url.pop()
    # Recover year, month, country group and province from the URL itself.
    url_txt_info = re.findall(r'.+ype=rmb&year=(.+)&startMonth=(.+)&endMonth=.+&monthFlag=1&unitFlag=true&codeLength=8&outerField1=CODE_TS&outerField2=ORIGIN_COUNTRY&outerField3=TRADE_MODE&outerField4=TRADE_CO_PORT&outerValue1=&outerValue2=(.+)&outerValue3=&outerValue4=(.+)&orderType=CODE.+', url_send)
    y, m, cs_code, p = url_txt_info[0]
    c = country_t_code.index(cs_code)  # index of the country group; used in the output file name
    options = Options()
    options.add_argument('--headless')
    UA = UserAgent().chrome
    options.add_argument('user-agent={}'.format(UA))
    proxy = proxys[randint(0, len(proxys) - 1)]  # picked but not applied; uncomment below to route through it
    # options.add_argument('proxy-server={}'.format(proxy))
    options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
    edge = webdriver.Chrome(options=options)  # pass executable_path=... if chromedriver is not on PATH
    # Hide navigator.webdriver before any page script runs (basic anti-bot evasion).
    edge.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver',{
            get: () => undefined
            })
            """
            })
    # Visit the site first so Selenium attaches the cookie to the current
    # domain; add_cookie() then works without an explicit 'domain' key.
    edge.get('http://43.248.49.97/')
    sleep(5)
    cookie_dict = {
        'expiry': 1606620717,  # hard-coded from a captured session; refresh once it expires
        'httpOnly': False,
        'name': '9CKCOkIaqzqET',
        'path': '/',
        'secure': False,
        'value': '5Y.k1NYFa0jVxcAfZya710GyNOswSINgrExzcBSh69V3b_3VvaJSG2Acij66UYQ2oH0JicC0V0LUSYYipkbwWtIy3qMMfBI4dj6T_5a4oFj1ROpXPdp2IMS2B2BACcbbDZOMIF2r0incao6q5gHO3dpmE8sLIsLuTdOBDcAcIpL_40_nUBbzFGi5H697kMIQqXy.Fk8l1gb8b2x_rMFtM4VgZnA6dJ8PrSLUFk.RjHLmAj2VCF8rVaJhCFqXirP1Kl'
    }
    edge.add_cookie(cookie_dict)
    edge.implicitly_wait(100)
    sleep(2)
    edge.get(url_send)
    sleep(3)
    print('--start-' * 15)
    try:
        # Wait until the first data row is visible...
        WebDriverWait(edge, timeout=40, poll_frequency=0.5).until(
            EC.visibility_of_any_elements_located((By.XPATH, '/html/body/div/table/tbody/tr[1]/td[1]/label')))
        # ...then wait for the loading indicator to disappear.
        WebDriverWait(edge, timeout=330, poll_frequency=0.5).until_not(
            EC.visibility_of_any_elements_located((By.XPATH, '//*[@id="test"]')))
        html_response = edge.page_source
        e = etree.HTML(html_response)
        data_date = e.xpath('''//tbody[@id = 'div1']/tr/td[1]/div/text()''')
        goods_code = e.xpath('''//tbody[@id = 'div1']/tr/td[2]/div/text()''')
        goods_name = e.xpath('''//tbody[@id = 'div1']/tr/td[3]/div/text()''')
        partner_code = e.xpath('''//tbody[@id = 'div1']/tr/td[4]/div/text()''')
        partner_name = e.xpath('''//tbody[@id = 'div1']/tr/td[5]/div/text()''')
        trade_code = e.xpath('''//tbody[@id = 'div1']/tr/td[6]/div/text()''')
        trade_name = e.xpath('''//tbody[@id = 'div1']/tr/td[7]/div/text()''')
        in_province_code = e.xpath('''//tbody[@id = 'div1']/tr/td[8]/div/text()''')
        in_province_name = e.xpath('''//tbody[@id = 'div1']/tr/td[9]/div/text()''')
        first_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[10]/div/text()''')
        first_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[11]/div/text()''')
        second_unit_num = e.xpath('''//tbody[@id = 'div1']/tr/td[12]/div/text()''')
        second_unit_name = e.xpath('''//tbody[@id = 'div1']/tr/td[13]/div/text()''')
        rmb_value = e.xpath('''//tbody[@id = 'div1']/tr/td[14]/div/text()''')
        all_info = {
            '数据年月': data_date,
            '商品编码': goods_code,
            '商品名称': goods_name,
            '贸易伙伴编码': partner_code,
            '贸易伙伴名称': partner_name,
            '贸易方式编码': trade_code,
            '贸易方式名称': trade_name,
            '注册地编码': in_province_code,
            '注册地名称': in_province_name,
            '第一数量': first_unit_num,
            '第一计量单位': first_unit_name,
            '第二数量': second_unit_num,
            '第二计量单位': second_unit_name,
            '人民币': rmb_value
        }
        outdata = pd.DataFrame(all_info)
        outdata.to_csv('{0}年{1}月{2}省市{3}国家进口数据.csv'.format(y, m, p, c), encoding='GBK')
        edge.quit()
        h += 1
        with open('爬取成功链接列表.txt', 'a') as f_success:
            f_success.write(url_send + '\n')
        print('*' * 100)
        length = len(data_date)
        print('Success', y, '\t', m, '\t', p, '\t', c, '\t', 'rows:', length)
        print('Crawled:', h, '\t', 'total:', kill_num, '\t', 'progress:', h / kill_num * 100, '%')
        print('*!' * 50)
    except Exception:
        with open('爬取失败链接列表.txt', 'a') as f_fail:
            f_fail.write(url_send + '\n')
        # Push the failed URL back onto the queue for a later retry.
        request_url = [url_send] + request_url
        edge.quit()
        print('~' * 100)
        print('Failed', y, '\t', m, '\t', p, '\t', c)
        print('~!' * 50)


if __name__ == '__main__':
    proxys = ['http://61.135.169.121:80']  # single active proxy; the candidate pool from version 1 can be swapped back in
    request_url = url_manager(year, month, province_code, country_t_code, url_base)
    try:
        with open('爬取成功链接列表.txt', 'r') as f_set_success:
            used_request_url = f_set_success.read()
    except FileNotFoundError:
        used_request_url = ''  # first run: no success log yet
    used_request_url_set = set(used_request_url.split('\n'))
    request_url = list(set(request_url) - used_request_url_set)  # skip URLs already crawled
    kill_num = len(request_url)
    h = 0
    while True:
        # Launch four crawler threads per round, pausing between starts.
        workers = [threading.Thread(target=web_engine) for _ in range(4)]
        for w in workers[:3]:
            w.start()
            sleep(randint(30, 40))
        workers[3].start()
        # Wait for two of the workers before starting the next round.
        workers[0].join()
        workers[2].join()
        print('Sleeping...')
        sleep(randint(30, 40))
        if h > kill_num:
            print('Crawled:', h, '\t', 'total:', kill_num, '\t', 'progress:', h / kill_num * 100, '%')
            break
    print('Crawler finished')
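
Both versions append every outcome to 爬取成功链接列表.txt or 爬取失败链接列表.txt, and the startup code already skips URLs found in the success log. A small companion sketch (not in the original) for replaying failed links on a later run, reusing the same newline-separated log files:

def load_retry_urls(fail_path='爬取失败链接列表.txt', success_path='爬取成功链接列表.txt'):
    # Return failed URLs that never subsequently succeeded.
    with open(fail_path, 'r') as f:
        failed = set(f.read().split('\n'))
    with open(success_path, 'r') as f:
        succeeded = set(f.read().split('\n'))
    failed.discard('')  # drop the empty entry from trailing newlines
    return list(failed - succeeded)
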




Source: https://blog.csdn.net/qq_42830971/article/details/109771313