Implementing a Web Crawler with aiohttp
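The full script is below. It crawls jobbole.com with aiohttp, using an asyncio.Semaphore to cap concurrency at three in-flight requests, PyQuery to pull links out of each page, and an aiomysql connection pool to store article titles in MySQL. A consumer coroutine drains a shared waiting_urls list and schedules a handler task per article page. The MySQL connection values are placeholders; fill in your own.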
import asyncio
import re

import aiohttp
import aiomysql
from pyquery import PyQuery

stopping = False
start_url = "http://www.jobbole.com/"
waiting_urls = []
seen_urls = set()

# Limit concurrent requests to 3 so we do not hammer the target site
sem = asyncio.Semaphore(3)


async def fetch(url, session):
    async with sem:
        await asyncio.sleep(1)  # be polite: pause before each request
        try:
            async with session.get(url) as response:
                print("Status:", response.status)
                print("Content-type:", response.headers.get("content-type"))
                if response.status in [200, 201]:
                    return await response.text()
        except Exception as e:
            print(e)
    return None


def extract_urls(html):
    # Collect in-site links we have not crawled yet
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        if url and url.startswith("/caijing") and url not in seen_urls:
            waiting_urls.append("http://www.jobbole.com{}".format(url))


async def init_urls(url, session):
    html = await fetch(url, session)
    seen_urls.add(url)
    if html:
        extract_urls(html)


async def article_handler(url, session, pool):
    # Fetch the article page, parse it, and save the title to the database
    html = await fetch(url, session)
    seen_urls.add(url)
    if html is None:
        return
    extract_urls(html)
    pq = PyQuery(html)
    title = pq("title").text()
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            # Parameterized query avoids SQL injection and quoting bugs
            await cur.execute("insert into artest(title) values (%s)", (title,))


async def consumer(pool):
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if len(waiting_urls) == 0:
                await asyncio.sleep(0.5)
                continue
            url = waiting_urls.pop()
            print("start get url: {}".format(url))
            if re.match(r"http://.*?jobbole\.com/.*\.html", url):
                # Looks like an article page: parse it and store the title
                if url not in seen_urls:
                    asyncio.ensure_future(article_handler(url, session, pool))
                await asyncio.sleep(0.5)
            else:
                # Otherwise treat it as a listing page and harvest more links
                if url not in seen_urls:
                    asyncio.ensure_future(init_urls(url, session))


async def main(loop):
    # Placeholder MySQL connection details; replace with your own
    pool = await aiomysql.create_pool(host="127.0.0.1", port=3306,
                                      user="root", password="",
                                      db="test", loop=loop,
                                      charset="utf8", autocommit=True)
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        if html:
            extract_urls(html)
    asyncio.ensure_future(consumer(pool))


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()
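The crawler assumes a table named artest with a title column already exists. The original post does not give the schema, so the following is only a minimal sketch of a one-off setup script, reusing the same placeholder connection details as main() above; the id column and varchar length are assumptions, not part of the original code.

import asyncio

import aiomysql


async def create_table():
    # Same placeholder connection details as in main(); adjust to your setup
    conn = await aiomysql.connect(host="127.0.0.1", port=3306,
                                  user="root", password="",
                                  db="test", charset="utf8",
                                  autocommit=True)
    async with conn.cursor() as cur:
        # One workable schema: an auto-increment id plus the title
        # column that article_handler() inserts into
        await cur.execute(
            "create table if not exists artest ("
            "  id int primary key auto_increment,"
            "  title varchar(255)"
            ") default charset=utf8"
        )
    conn.close()


if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(create_table())

Note that stopping is never set to True, so the crawler runs until you interrupt it (e.g. Ctrl-C): loop.run_forever() keeps the event loop alive for the background tasks that consumer() keeps scheduling.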
Source: https://www.cnblogs.com/yejing-snake/p/14276613.html