擅长:python、mysql、java
<pre><code>import csv
from collections import namedtuple
import scrapy
def get_urls_from_csv(path='data.csv', column=2):
    """Yield the URL stored in one column of every row of a CSV file.

    Args:
        path: CSV file to read. Defaults to ``data.csv``, resolved against
            the current working directory.
        column: Zero-based index of the column that holds the URL.
            Defaults to 2 (the third column).

    Yields:
        The cell text of ``column`` for each row, with surrounding
        whitespace removed.

    Rows that are blank, have too few columns, or whose URL cell is empty
    are skipped instead of raising ``IndexError`` or yielding an empty
    string.
    """
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(path, newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            if len(row) <= column:
                continue  # blank line or truncated record
            url = row[column].strip()
            if url:
                yield url
# Lightweight record: `sourceurl` is the page that was crawled and `rssurl`
# is one RSS feed link found on it.
# NOTE(review): the original remark here hinted that this plain tuple type is
# meant for the case where something other than scrapy is used -- confirm.
rssitem = namedtuple('rssitem', ['sourceurl', 'rssurl'])
class RssparserSpider(scrapy.Spider):
    """Spider that visits every site from the CSV and reports its RSS feeds.

    One request is issued per URL produced by ``get_urls_from_csv()``. For
    each response, every ``<link type="application/rss+xml">`` element
    yields one item pairing the page URL with the feed URL.
    """

    name = "rssspider"
    # An empty list means "no offsite filtering". The previous value ["*"]
    # is not a wildcard: Scrapy treats each entry as a literal domain name,
    # so it could cause every request to be dropped as offsite.
    allowed_domains = []
    # Unused because start_requests() is overridden; kept for compatibility.
    start_urls = ()

    def start_requests(self):
        """Yield one request per URL listed in the CSV file (generator)."""
        for start_url in get_urls_from_csv():
            # Only prepend a scheme when the CSV entry does not already have
            # one, otherwise the result would be "http://http://...".
            if "://" not in start_url:
                start_url = "http://{}".format(start_url)
            yield scrapy.Request(url=start_url)

    def parse(self, response):
        """Yield one item dict per RSS ``<link>`` found in the response.

        Each item has the keys ``sourceurl`` (the page that was fetched) and
        ``rssurl`` (the feed location, resolved to an absolute URL).
        """
        hrefs = response.xpath('//link[@type="application/rss+xml"]/@href')
        for href in hrefs.extract():
            # Feed links are often relative; resolve them against the page.
            feed_url = response.urljoin(href)
            # Scrapy does not accept a namedtuple as an item, so emit the
            # record as a dict with the same field names.
            yield rssitem(response.url, feed_url)._asdict()
</code></pre>