Python HTTP requests


I am using this script:

from twisted.internet import reactor, threads
from urlparse import urlparse
import httplib
import itertools


concurrent = 200
finished=itertools.count(1)
reactor.suggestThreadPoolSize(concurrent)

def getStatus(ourl):
    url = urlparse(ourl)
    conn = httplib.HTTPConnection(url.netloc)   
    conn.request("HEAD", url.path)
    res = conn.getresponse()
    return res.status

def processResponse(response,url):
    print response, url
    processedOne()

def processError(error,url):
    print "error", url#, error
    processedOne()

def processedOne():
    if finished.next()==added:
        reactor.stop()

def addTask(url):
    req = threads.deferToThread(getStatus, url)
    req.addCallback(processResponse, url)
    req.addErrback(processError, url)   

added=0
for url in open('urllist.txt'):
    added+=1
    addTask(url.strip())

try:
    reactor.run()
except KeyboardInterrupt:
    reactor.stop()

When I run the script with $ python test.py,

it only prints the URLs; the cURL/HTTP requests never actually get sent.

How can I make it send an HTTP (or cURL) request for each URL?

Thanks


2 Answers

Test code using inlineCallbacks and deferToThread. It also uses defer.gatherResults to know when all of the deferreds have been handled (instead of the counter approach in the OP):

from twisted.internet import reactor, defer, utils
from twisted.internet.threads import deferToThread
from urlparse import urlparse
import httplib

threadDeferred = deferToThread.__get__
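# (The line above binds deferToThread via the descriptor protocol so it can be
# used as a decorator: calling the decorated function runs it in the reactor's
# thread pool and returns a Deferred that fires with the function's result.)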

@threadDeferred
def get_url_head(url_arg):
  url = urlparse(url_arg)
  conn = httplib.HTTPConnection(url.netloc)   
  conn.request("HEAD", url.path)
  res = conn.getresponse()
  conn.close()
  return res.status

@defer.inlineCallbacks
def check_url(sem,url_arg):
  yield sem.acquire()
  try:
    result = yield get_url_head(url_arg)
    defer.returnValue(result)
  finally:
    sem.release()

@defer.inlineCallbacks
def run(reactor,SEMAPHORE_SIZE=10):
  sem = defer.DeferredSemaphore(SEMAPHORE_SIZE)
  deferreds = []
  failed_urls = []
  responded_urls = []
  with open('urllist.txt','r') as f:
    for line in f:
      url_arg = line.strip()
      d = check_url(sem,url_arg)
      d.addCallback(processResult,url_arg,responded_urls).addErrback(processErr,url_arg,failed_urls)
      deferreds.append(d)
  res = yield defer.gatherResults(deferreds)
  # Do something else with failed_urls and responded_urls
  reactor.callLater(0,reactor.stop)

def main():
  from twisted.internet import reactor
  reactor.callWhenRunning(run,reactor)
  reactor.run()

def processResult(result,url_arg,responded_urls):
  print "Reponse %s from %s" % (result,url_arg)
  responded_urls.append((url_arg,result))

def processErr(err,url_arg,failed_urls):
  print "Error checking %s: %s" % (url_arg,repr(err.value))
  failed_urls.append((url_arg,err.value))

if __name__ == '__main__':
  main()
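
The DeferredSemaphore is what keeps at most SEMAPHORE_SIZE HEAD requests in flight at any one time, while gatherResults waits for every URL to be handled before the reactor is stopped. As a side note (not part of the original answer), the acquire/try/finally dance in check_url can also be written with the semaphore's run() helper; a minimal sketch, assuming the same get_url_head as above:

def check_url(sem, url_arg):
  # sem.run() acquires the semaphore, calls the function, and releases it
  # once the Deferred returned by get_url_head fires
  return sem.run(get_url_head, url_arg)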

However, this should work as long as the URLs in the file do not include "http://"; if they do include "http://", the corresponding fix is in the comment inside the code:

import httplib

def requester(url):
    host = url.split('/')[0]
    #if urls do contain 'http://'  >  host = url.split('/')[2].replace('http://','')
    req = url[url.find(host)+len(host):]
    conn = httplib.HTTPConnection(host)
    conn.request("HEAD","/"+req)
    response = conn.getresponse()
    print response.status, response.reason

    #if you want data...
    #data = response.read()
    #print data

for url in open('urls.txt'):
    try:
        # strip the trailing newline before building the request
        requester(url.strip())
    except Exception, e:
        print "error", e

Also, I double-checked this against httplib.
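
If you would rather not slice the URL string by hand, a rough alternative sketch (mine, not from the answer above) is to let urlparse do the work, prepending a scheme whenever a line in urls.txt lacks "http://":

import httplib
from urlparse import urlparse

def requester(url):
    # assumed input: one URL per line, with or without a leading 'http://'
    if not url.startswith('http://'):
        url = 'http://' + url
    parsed = urlparse(url)
    conn = httplib.HTTPConnection(parsed.netloc)
    conn.request("HEAD", parsed.path or "/")
    response = conn.getresponse()
    print response.status, response.reason
    conn.close()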
