擅长:Python、MySQL、Java
<p>您可以尝试以下方法:</p>
<pre><code>class APSpider(BaseSpider):
name = "APSpider"
start_urls = [
"http://www.somedomain.com/list-of-websites",
]
def __init__(self):
self.allowed_domains = None
def parse(self, response):
soup = BeautifulSoup( response.body )
if not self.allowed_domains:
for link_tag in soup.findAll('td',{'class':'half-width'}):
_website = link_tag.find('a')['href']
u = urlparse.urlparse(_website)
self.allowed_domains.append(u.netloc)
yield Request(url=_website, callback=self.parse_secondary_site)
if response.url in self.allowed_domains:
yield Request(...)
...
</code></pre>