asyncio异步采集小试一下,果然快!
作者:互联网
# _*_ coding: utf-8 _*_
"""Async scraper for www.13qh.com.

Crawls the site's category tree, then every listing page and goods detail
page; downloads thumbnail/detail images to mirrored local paths and appends
one JSON object per product to goods.txt.  Concurrency is capped by a
semaphore over aiohttp requests.
"""
import codecs
from bs4 import BeautifulSoup
import time, json, math
import sys, os
import asyncio
import aiohttp
import aiofiles

# Output file: one JSON object per line, one line per scraped product.
f = codecs.open('goods.txt', 'w', encoding='utf-8', errors='ignore')
# Caps concurrent HTTP requests at 5.  NOTE(review): created before any event
# loop runs; fine on Python 3.10+ where sync primitives bind the loop lazily.
semaphore = asyncio.Semaphore(5)
#asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())


async def getHtml(url):
    """Fetch *url*.

    For ``.jpg`` URLs: save the image under a local path mirroring the URL
    (creating directories as needed) and return True.  Otherwise return the
    decoded page text.
    """
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as html:
                if url.endswith('.jpg'):
                    img = await html.read()
                    # Mirror the remote path locally by stripping the host.
                    imgname = url.replace('http://www.13qh.com/', '')
                    imgpath = os.path.dirname(imgname)
                    if not os.path.exists(imgpath):
                        os.makedirs(imgpath)
                    # FIX: context manager closes the file; the original
                    # leaked every aiofiles handle it opened.
                    async with aiofiles.open(imgname, 'wb') as fp:
                        await fp.write(img)
                    return True
                tmp = await html.text(encoding='utf-8')
                return tmp


async def getList(url, **cat):
    """Parse one listing page and scrape every goods link found on it."""
    tmp = await getHtml(url)
    try:
        htm = BeautifulSoup(tmp, 'lxml')
        ul = htm.select('.goods-item .goods-pic a')
    except Exception as e:
        print(e)
        ul = None
    # FIX: identity comparison instead of `!= None`.
    if ul is not None:
        for li in ul:
            link = li.get('href')
            await parse(link, **cat)


async def parse(url, **cat):
    """Scrape one goods detail page.

    Extracts the product fields, downloads thumbnail and detail images, and
    appends the product as one JSON line to goods.txt.  Any scraping error is
    printed and the product is skipped.
    """
    tmp = await getHtml(url)
    try:
        htm = BeautifulSoup(tmp, 'lxml')
        goods_id = url.split('/')[-1]
        goods_name = htm.select('.goods-title h3')[0].text
        goods_name_sub = htm.select('.goods-title p')[0].text  # scraped but unused
        goods_price = htm.select('.goods-info .sale_price')[0].text
        sale_price = htm.select('.goods-info ul li')[0].find('del').text
        # FIX: filter() returns an iterator in Python 3 — the original stored
        # a filter object in the JSON.  Keep only digits and the dot.
        sale_price = ''.join(ch for ch in sale_price if ch in '.0123456789')
        thumb_cont = htm.select('.thumb-cont ul li')
        print(goods_name)
        goods_thumb = []
        for thumb in thumb_cont:
            img = thumb.find('img').get('big')
            goods_thumb.append(img)
            print(img)
            await getHtml('http://www.13qh.com' + img)
        detail_div = htm.select('.detail-content p img')
        goods_detail = []
        for p in detail_div:
            goods_detail.append(p.get('src'))
            print(p.get('src'))
            await getHtml('http://www.13qh.com' + p.get('src'))
        goods = {
            'cat_id': cat['lan_id'],
            'sub_id': cat['sub_id'],
            'goods_id': goods_id,
            'goods_name': goods_name,
            'goods_price': goods_price,
            'sale_price': sale_price,
            'goods_thumb': goods_thumb,
            'goods_detail': goods_detail,
        }
        # FIX: write '\n', not os.linesep — text mode already translates
        # newlines, so os.linesep produced \r\r\n on Windows.
        f.write(json.dumps(goods) + '\n')
    except Exception as e:
        print(e)


async def caiz():
    """Crawl the category tree from the home page.

    Builds the flat category list (three levels deep), schedules listing
    crawls for pages 1-19 of every leaf category, dumps the category list to
    category.txt, then awaits all scheduled crawls.
    """
    url = 'http://www.13qh.com/'
    tmp = await getHtml(url)
    htm = BeautifulSoup(tmp, 'lxml')
    cat = htm.select('.category-content>ul>li')
    category = []
    tasks = []
    for li in cat:
        lan = li.select('p a')[0]
        lan_text = lan.text
        lan_id = lan.get('href').split('/')[-1]
        category.append({'cat_id': lan_id, 'cat_name': lan_text, 'parent_id': 0})
        ul = li.select('.category-list ul li')
        for u in ul:
            ua = u.select('.a')
            for a in ua:
                # FIX: select() returns a list; take the first match — the
                # original called .text on the ResultSet, which raises.
                sua = a.select('a')[0]
                sua_text = sua.text
                sua_id = sua.get('href').split('/')[-1]
                category.append({'cat_id': sua_id, 'cat_name': sua_text, 'parent_id': lan_id})
            ub = u.select('.b a')[0]
            sub_text = ub.text
            sub_id = ub.get('href').split('/')[-1]
            category.append({'cat_id': sub_id, 'cat_name': sub_text, 'parent_id': lan_id})
            uc = u.select('.c a')
            for c in uc:
                suc_text = c.text
                suc_href = c.get('href')
                suc_id = suc_href.split('/')[-1]
                category.append({'cat_id': suc_id, 'cat_name': suc_text, 'parent_id': sub_id})
                for i in range(1, 20):
                    tasks.append(asyncio.ensure_future(
                        getList("%s/page/%s" % (suc_href, i),
                                lan_id=lan_id, sub_id=sub_id)))
    with codecs.open('category.txt', 'w', encoding='utf-8', errors='ignore') as ff:
        ff.write(json.dumps(category))
    # FIX: await the scheduled crawls.  asyncio.run() cancels all pending
    # tasks when the main coroutine returns, so the original never finished
    # (often never started) the listing/product crawl.
    if tasks:
        await asyncio.gather(*tasks)


def main():
    """Entry point: run the crawl, then close the shared output file."""
    # FIX: dropped the unused asyncio.get_event_loop() call; asyncio.run()
    # creates and manages its own loop.
    asyncio.run(caiz())
    f.close()


if __name__ == '__main__':
    main()
标签:异步,lan,goods,text,cat,试一下,asyncio,id,select 来源: https://www.cnblogs.com/6min/p/14078916.html