Using a crawler to fetch page content and store it
- Goal: fetch the live-stream listings from the CSDN homepage
- The class that fetches the content: `Crowller`
  - Fetching the content: request the page HTML with the `superagent` package
  - Storing the content: write it to a file with `fs.writeFileSync`

The class implementing these two points:
```typescript
import fs from 'fs';
import path from 'path';
import superagent from 'superagent';
import DellAnalyzer from './dellAnalyzer';

export interface Analyzer {
  analyze: (html: string, filePath: string) => string;
}

class Crowller {
  private filePath = path.resolve(__dirname, '../data/live.json'); // where the result is stored

  // Fetch the raw page HTML
  private async getRawHtml() {
    const result = await superagent.get(this.url);
    return result.text;
  }

  // Write the crawled content to the file
  private writeFile(content: string) {
    fs.writeFileSync(this.filePath, content);
  }

  // Kick off the crawl
  private async initSpiderProcess() {
    const html = await this.getRawHtml();
    const fileContent = this.analyzer.analyze(html, this.filePath); // extract the parts of the page we need
    this.writeFile(fileContent);
  }

  constructor(private url: string, private analyzer: Analyzer) {
    this.initSpiderProcess();
  }
}

const url = `https://www.csdn.net/?spm=3001.4476`; // CSDN homepage, where the live-stream info lives
const analyzer = DellAnalyzer.getInstance(); // obtained via the singleton pattern
new Crowller(url, analyzer); // create an instance to run the crawl
```
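Because `Crowller` only depends on the exported `Analyzer` interface, the parsing strategy can be swapped without touching the crawler itself. A minimal sketch (the `LengthAnalyzer` below is a hypothetical example added for illustration, not part of the original code; it assumes it is appended to the same `crowller.ts` file):

```typescript
// Hypothetical alternative analyzer: anything matching the Analyzer interface works.
class LengthAnalyzer implements Analyzer {
  // Skips cheerio entirely and only records when and how much HTML was fetched.
  analyze(html: string, _filePath: string): string {
    return JSON.stringify({ fetchedAt: new Date().getTime(), htmlLength: html.length });
  }
}

new Crowller(url, new LengthAnalyzer()); // same Crowller, different analysis strategy
```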
- The class that processes the page content: using the `cheerio` package, page nodes are queried in a jQuery-like way to pull out the content that needs to be stored.
```typescript
import fs from 'fs';
import cheerio from 'cheerio'; // tool for parsing the page content
import { Analyzer } from './crowller';

interface Live {
  title: string;
  time: string;
}

interface LiveResult {
  time: number;
  data: Live[];
}

interface Content {
  [propName: number]: Live[];
}

export default class DellAnalyzer implements Analyzer {
  private static instance: DellAnalyzer;

  static getInstance() {
    if (!DellAnalyzer.instance) {
      DellAnalyzer.instance = new DellAnalyzer();
    }
    return DellAnalyzer.instance;
  }

  // Extract the title and time of each live stream
  private getLiveInfo(html: string) {
    const $ = cheerio.load(html);
    const courseItems = $('.www_live_item');
    const courseInfos: Live[] = [];
    courseItems.map((index, element) => {
      const title = $(element).find('h3').text();
      const time = $(element).find('.text').text();
      courseInfos.push({ title, time });
    });
    return {
      time: new Date().getTime(),
      data: courseInfos,
    };
  }

  // Build the content that will be stored
  private generateJsonContent(liveInfo: LiveResult, filePath: string) {
    let fileContent: Content = {};
    // If the file already exists, merge the new data into its content
    if (fs.existsSync(filePath)) {
      fileContent = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    }
    fileContent[liveInfo.time] = liveInfo.data;
    return fileContent;
  }

  public analyze(html: string, filePath: string) {
    const liveInfo = this.getLiveInfo(html);
    const fileContent = this.generateJsonContent(liveInfo, filePath);
    return JSON.stringify(fileContent);
  }

  private constructor() {}
}
```
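Each call to `analyze` merges a new entry, keyed by the crawl timestamp, into whatever is already in the file, so `live.json` grows over time. Written against the `Live` and `Content` interfaces above, the stored shape looks roughly like this (the timestamps and strings are made-up placeholder values):

```typescript
// Illustrative only: the kind of object that ends up serialized into data/live.json.
const exampleContent: Content = {
  1638770000000: [{ title: 'placeholder live title', time: 'placeholder time text' }],
  1638770600000: [{ title: 'another placeholder title', time: 'another time text' }],
};
```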
- package.json configuration
  - `tsc`: compiles the `.ts` files into `.js` files
  - `nodemon`: automatically restarts the process whenever it detects file changes in the watched directory
  - `concurrently`: runs several commands at the same time

The relevant configuration:
```json
{
  "scripts": {
    "dev:build": "tsc -w",                           // compile on change
    "dev:start": "nodemon node ./build/crowller.js", // run the compiled crawler
    "dev": "concurrently npm:dev:*"
  },
  "nodemonConfig": {
    "ignore": ["data/*"]
  }
}
```
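The `dev:start` script runs the compiled file from `./build`, so `tsc` needs to emit there. The original post does not show its `tsconfig.json`; a minimal sketch consistent with these scripts, assuming the source files live in `./src`, might look like:

```json
{
  "compilerOptions": {
    "outDir": "./build",     // matches the path used by dev:start
    "rootDir": "./src",      // assumed location of crowller.ts and dellAnalyzer.ts
    "module": "commonjs",
    "target": "es2017",
    "esModuleInterop": true  // needed for the default imports of fs, path, superagent, cheerio
  },
  "include": ["src"]
}
```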
With `npm run dev`, saving a change to a `.ts` file recompiles it to `.js` and reruns the `crowller` code. The `nodemonConfig.ignore` entry stops nodemon from restarting when the crawler itself writes into `data/`.