其他分享
首页 > 其他分享 > 爬取知乎热门保存为csv

爬取知乎热门保存为csv

作者:互联网

requests_(sava_csv_list)zhihu_com_collection_hot.py

#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:zhihu_com_collection_hot.py
# Author:LGSP_Harold
import csv
import os

import requests
from lxml import etree


# Directory that receives the exported CSV, relative to the current working
# directory of the process.
file_dir = './files/zhihu_com_collection_hot'
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of testing os.path.exists() before calling makedirs().
os.makedirs(file_dir, exist_ok=True)

# Page to scrape: the /collection/hot listing on www.zhihu.com.
url = 'https://www.zhihu.com/collection/hot'

# Browser-like request headers sent with the GET request.
#
# Two deliberate omissions:
# * HTTP/2 pseudo-headers (authority/method/path/scheme) are transport-level
#   fields shown by browser dev tools, not regular header fields; the HTTP
#   client derives them from the URL and the request method.
# * 'br' is not advertised in accept-encoding: requests always decodes gzip
#   and deflate, but Brotli only when an optional Brotli package is
#   installed. Advertising it could make response.text undecodable.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Fetch the listing page. Without an explicit timeout requests waits
# indefinitely on an unresponsive server, so bound the wait (seconds).
response = requests.get(url=url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page and silently
# producing a CSV that contains only the header row.
response.raise_for_status()

# Parse the HTML and select one element per collection card.
html_obj = etree.HTML(response.text)
items = html_obj.xpath('//div[@class="CollectionListCard CollectionHotListPage-collectionCard"]')

def _first(nodes, default=''):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


file_path = os.path.join(file_dir, 'zhihu_com_collection_hot.csv')
# newline='' is required by the csv module: the writer emits its own line
# terminators, and without it Windows output gets an extra blank line after
# every row.
with open(file=file_path, mode='w', encoding='utf-8', newline='') as file:
    # csv.writer separates fields with ',' by default; `delimiter` overrides
    # that, and a backtick is used here.
    writer = csv.writer(file, delimiter='`')
    writer.writerow(['question', 'link', 'author', 'answer'])

    data = []

    for item in items:
        # Each XPath query returns a list; a card lacking one of the fields
        # yields an empty list, so fall back to '' instead of raising
        # IndexError and aborting the whole export.
        author = _first(item.xpath('.//span[@class="CollectionListCard-creatorName"]/text()'))
        question = _first(item.xpath('.//a[@class="CollectionListCard-contentTitle"]/text()'))
        link = _first(item.xpath('.//a[@class="CollectionListCard-contentTitle"]/@href'))
        answer = _first(item.xpath('.//div[@class="CollectionListCard-contentExcerpt"]/text()'))

        # Column order matches the header row written above.
        data.append([question, link, author, answer])

    writer.writerows(data)

 

标签:zhihu,取知,text,writer,热门,collection,hot,file,csv
来源: https://www.cnblogs.com/Harold-Hua/p/15157175.html