如何编写python scrapy代码来提取站点站点地图中的url
作者:互联网
我正在尝试使用此代码获取站点地图中的网址列表.当我运行这个,我看到屏幕上没有结果.任何人都可以告诉我这个问题是什么,或者建议我用一个很好的例子.提前致谢
class MySpider(SitemapSpider):
name = "xyz"
allowed_domains = ["xyz.nl"]
sitemap_urls = ["http://www.xyz.nl/sitemap.xml"]
def parse(self, response):
print response.url
return Request(response.url, callback=self.parse_sitemap_url)
def parse_sitemap_url(self, response):
# do stuff with your sitemap links
解决方法:
此蜘蛛将获取站点地图中的所有URL并将其保存到列表中.您可以轻松地将其更改为输出到文件或控制台.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests
class GetpagesfromsitemapSpider(SitemapSpider):
name = "test"
handle_httpstatus_list = [404]
def parse(self, response):
print response.url
def _parse_sitemap(self, response):
if response.url.endswith('/robots.txt'):
for url in sitemap_urls_from_robots(response.body):
yield Request(url, callback=self._parse_sitemap)
else:
body = self._get_sitemap_body(response)
if body is None:
self.logger.info('Ignoring invalid sitemap: %s', response.url)
return
s = Sitemap(body)
sites = []
if s.type == 'sitemapindex':
for loc in iterloc(s, self.sitemap_alternate_links):
if any(x.search(loc) for x in self._follow):
yield Request(loc, callback=self._parse_sitemap)
elif s.type == 'urlset':
for loc in iterloc(s):
for r, c in self._cbs:
if r.search(loc):
sites.append(loc)
break
print sites
def __init__(self, spider=None, *a, **kw):
super(GetpagesfromsitemapSpider, self).__init__(*a, **kw)
self.spider = spider
l = []
url = "https://channelstore.roku.com"
resp = requests.head(url + "/sitemap.xml")
if (resp.status_code != 404):
l.append(resp.url)
else:
resp = requests.head(url + "/robots.txt")
if (resp.status_code == 200):
l.append(resp.url)
self.sitemap_urls = l
print self.sitemap_urls
def iterloc(it, alt=False):
for d in it:
yield d['loc']
# Also consider alternate URLs (xhtml:link rel="alternate")
if alt and 'alternate' in d:
for l in d['alternate']:
yield l
标签:sitemap,python,scrapy,web-crawler 来源: https://codeday.me/bug/20191008/1874013.html