I have a site with only a login page. I want to log in at http://145.100.108.148/login2/login.php and then crawl the next page, http://145.100.108.148/login2/index.php. Both pages must be saved to disk as .html files.
from scrapy.http import Request, FormRequest
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TestSpider(CrawlSpider):
    name = 'testspider'
    login_page = 'http://145.100.108.148/login2/login.php'
    start_urls = ['http://145.100.108.148/login2/index.php']

    rules = (
        Rule(LinkExtractor(allow=r'.*'),
             callback='parse_item', follow=True),
    )

    login_user = 'test@hotmail.com'
    login_pass = 'test'

    def start_request(self):
        """This function is called before crawling starts"""
        return [Request(url=self.login_page, callback=self.login)]

    def login(self, response):
        """Generate a login request"""
        return FormRequest.from_response(
            response,
            formdata={'email': self.login_user,
                      'pass': self.login_pass},
            callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in"""
        if b"Dashboard" in response.body:
            self.logger.info("Successfully logged in. Let's start crawling!")
            return self.initialized()
        else:
            self.logger.info("NOT LOGGED IN :(")
            # Something went wrong; we couldn't log in, so nothing happens.
            return

    def parse_item(self, response):
        """Save pages to disk"""
        self.logger.info('Hi, this is an item page! %s', response.url)
        page = response.url.split("/")[-2]
        filename = 'scraped-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
Output: while crawling, there is no output from check_login_response telling me whether or not the spider is logged in, even though that if/else statement was added. So I am not sure whether the crawler is actually authenticated. There is also only one saved file, named scraped-login2.html, while I expected at least three files: the register page, the login page, and the index page.
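Separately, note that parse_item derives the filename from the second-to-last URL path segment, so every page under /login2/ maps to the same name and each save overwrites the previous one. A minimal sketch of that naming logic outside Scrapy (the register.php URL here is a guess at the register page's address):

```python
def filename_for(url: str) -> str:
    """Mirror parse_item's naming: use the second-to-last path segment."""
    page = url.split("/")[-2]
    return 'scraped-%s.html' % page

# All pages under /login2/ collapse to the same filename,
# so each save overwrites the last:
print(filename_for('http://145.100.108.148/login2/login.php'))     # → scraped-login2.html
print(filename_for('http://145.100.108.148/login2/index.php'))     # → scraped-login2.html
print(filename_for('http://145.100.108.148/login2/register.php'))  # → scraped-login2.html
```

Using the last segment (e.g. `url.split("/")[-1]`) would give each page its own file.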
CrawlSpider inherits from Spider, and the init_request/initialized() hooks only exist when inheriting from InitSpider. Also, Scrapy calls start_requests (plural), not start_request, so the login request is never scheduled; the method needs to be renamed. Next, the response you get in response.body is bytes, so the check must compare against b"Dashboard" rather than a plain string.
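The bytes point can be checked in plain Python, without Scrapy: a membership test that mixes str and bytes raises rather than silently returning False.

```python
body = b"<h1>Dashboard</h1>"  # response.body is always bytes in Scrapy

# Testing a str against bytes raises TypeError on Python 3:
try:
    "Dashboard" in body
except TypeError as e:
    print("str vs bytes:", e)

# Compare bytes with bytes instead:
print(b"Dashboard" in body)  # → True
```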
Thanks to @Tarun Lalwani and some trial and error, the result is as follows: