我试图用BeautifulSoup和Requests搜索一个网站。我面临的问题是,我得到一些结果回来,然后程序无限期地挂起。然后,我尝试使用timeout
函数,它返回了一个错误。以下是我使用的代码:
import requests
from bs4 import BeautifulSoup
all_data=[]
for i in range(1950,2020):
x = requests.get("https://indiankanoon.org/browse/supremecourt/%s/"%(i), timeout = 5)
soup = BeautifulSoup(x.content, 'html.parser')
data = [x["href"] for x in soup.find_all('a',{'href': re.compile(r'/search/')})]
all_data.append(data)
我试过在requests.get()
中使用headers
,但没有解决这个问题。下面是我得到的错误:
----------------------
1950
-----------------------
['/search/?formInput=doctypes:supremecourt fromdate:1-1-1950 todate: 31-1-1950', '/search/?formInput=doctypes:supremecourt fromdate:1-2-1950 todate: 28-2-1950', '/search/?formInput=doctypes:supremecourt fromdate:1-3-1950 todate: 31-3-1950', '/search/?formInput=doctypes:supremecourt fromdate:1-4-1950 todate: 30-4-1950', '/search/?formInput=doctypes:supremecourt fromdate:1-5-1950 todate: 31-5-1950', '/search/?formInput=doctypes:supremecourt fromdate:1-6-1950 todate: 30-6-1950', '/search/?formInput=doctypes:supremecourt fromdate:1-7-1950 todate: 31-7-1950', '/search/?formInput=doctypes:supremecourt fromdate:1-8-1950 todate: 31-8-1950', '/search/?formInput=doctypes:supremecourt fromdate:1-9-1950 todate: 30-9-1950', '/search/?formInput=doctypes:supremecourt fromdate:1-10-1950 todate: 31-10-1950', '/search/?formInput=doctypes:supremecourt fromdate:1-11-1950 todate: 30-11-1950', '/search/?formInput=doctypes:supremecourt fromdate:1-12-1950 todate: 31-12-1950']
----------------------
1951
-----------------------
['/search/?formInput=doctypes:supremecourt fromdate:1-1-1951 todate: 31-1-1951', '/search/?formInput=doctypes:supremecourt fromdate:1-2-1951 todate: 28-2-1951', '/search/?formInput=doctypes:supremecourt fromdate:1-3-1951 todate: 31-3-1951', '/search/?formInput=doctypes:supremecourt fromdate:1-4-1951 todate: 30-4-1951', '/search/?formInput=doctypes:supremecourt fromdate:1-5-1951 todate: 31-5-1951', '/search/?formInput=doctypes:supremecourt fromdate:1-6-1951 todate: 30-6-1951', '/search/?formInput=doctypes:supremecourt fromdate:1-7-1951 todate: 31-7-1951', '/search/?formInput=doctypes:supremecourt fromdate:1-8-1951 todate: 31-8-1951', '/search/?formInput=doctypes:supremecourt fromdate:1-9-1951 todate: 30-9-1951', '/search/?formInput=doctypes:supremecourt fromdate:1-10-1951 todate: 31-10-1951', '/search/?formInput=doctypes:supremecourt fromdate:1-11-1951 todate: 30-11-1951', '/search/?formInput=doctypes:supremecourt fromdate:1-12-1951 todate: 31-12-1951']
----------------------
1952
-----------------------
['/search/?formInput=doctypes:supremecourt fromdate:1-1-1952 todate: 31-1-1952', '/search/?formInput=doctypes:supremecourt fromdate:1-2-1952 todate: 29-2-1952', '/search/?formInput=doctypes:supremecourt fromdate:1-3-1952 todate: 31-3-1952', '/search/?formInput=doctypes:supremecourt fromdate:1-4-1952 todate: 30-4-1952', '/search/?formInput=doctypes:supremecourt fromdate:1-5-1952 todate: 31-5-1952', '/search/?formInput=doctypes:supremecourt fromdate:1-6-1952 todate: 30-6-1952', '/search/?formInput=doctypes:supremecourt fromdate:1-7-1952 todate: 31-7-1952', '/search/?formInput=doctypes:supremecourt fromdate:1-8-1952 todate: 31-8-1952', '/search/?formInput=doctypes:supremecourt fromdate:1-9-1952 todate: 30-9-1952', '/search/?formInput=doctypes:supremecourt fromdate:1-10-1952 todate: 31-10-1952', '/search/?formInput=doctypes:supremecourt fromdate:1-11-1952 todate: 30-11-1952', '/search/?formInput=doctypes:supremecourt fromdate:1-12-1952 todate: 31-12-1952']
----------------------
1953
-----------------------
---------------------------------------------------------------------------
timeout Traceback (most recent call last)
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
375 try:
--> 376 self._validate_conn(conn)
377 except (SocketTimeout, BaseSSLError) as e:
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/urllib3/connectionpool.py in _validate_conn(self, conn)
993 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
--> 994 conn.connect()
995
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/urllib3/connection.py in connect(self)
393 server_hostname=server_hostname,
--> 394 ssl_context=context,
395 )
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/urllib3/util/ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password)
369 if HAS_SNI and server_hostname is not None:
--> 370 return context.wrap_socket(sock, server_hostname=server_hostname)
371
~/.pyenv/versions/3.7.3/lib/python3.7/ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
411 context=self,
--> 412 session=session
413 )
~/.pyenv/versions/3.7.3/lib/python3.7/ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
852 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 853 self.do_handshake()
854 except (OSError, ValueError):
~/.pyenv/versions/3.7.3/lib/python3.7/ssl.py in do_handshake(self, block)
1116 self.settimeout(None)
-> 1117 self._sslobj.do_handshake()
1118 finally:
timeout: _ssl.c:1039: The handshake operation timed out
During handling of the above exception, another exception occurred:
ReadTimeoutError Traceback (most recent call last)
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
719 retries = retries.increment(
--> 720 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
721 )
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
399 if read is False or not self._is_method_retryable(method):
--> 400 raise six.reraise(type(error), error, _stacktrace)
401 elif read is not None:
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/urllib3/packages/six.py in reraise(tp, value, tb)
734 raise value.with_traceback(tb)
--> 735 raise value
736 finally:
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
671 headers=headers,
--> 672 chunked=chunked,
673 )
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
378 # Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.
--> 379 self._raise_timeout(err=e, url=url, timeout_value=conn.timeout)
380 raise
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/urllib3/connectionpool.py in _raise_timeout(self, err, url, timeout_value)
330 raise ReadTimeoutError(
--> 331 self, url, "Read timed out. (read timeout=%s)" % timeout_value
332 )
ReadTimeoutError: HTTPSConnectionPool(host='indiankanoon.org', port=443): Read timed out. (read timeout=5)
During handling of the above exception, another exception occurred:
ReadTimeout Traceback (most recent call last)
<ipython-input-7-ef31c3cbb243> in <module>
5 print('-----------------------')
6 # , headers = headers,timeout = 60headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
----> 7 x = requests.get("https://indiankanoon.org/browse/supremecourt/%s/"%(i), timeout = 5)
8 soup = BeautifulSoup(x.content, 'html.parser')
9 data = [x["href"] for x in soup.find_all('a',{'href': re.compile(r'/search/')})]
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
~/.pyenv/versions/3.7.3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
527 raise SSLError(e, request=request)
528 elif isinstance(e, ReadTimeoutError):
--> 529 raise ReadTimeout(e, request=request)
530 else:
531 raise
ReadTimeout: HTTPSConnectionPool(host='indiankanoon.org', port=443): Read timed out. (read timeout=5)
正如我们所看到的,它正确地返回了一些结果(本例中是其中的3个),但随后返回了错误。我试过把timeout
增加到60
,但没有效果。有人能告诉我怎么解决吗?你知道吗
在请求之间尝试
time.sleep(2)
,似乎确实解决了这个问题或者可以使用Try..exception阻止。如果超时错误来了,它会打印出来,然后移到明年继续循环。到避免证书问题我使用了
verify=False
这是密码。你知道吗
这里是控制台上的输出。你知道吗
等等。。。。你知道吗
相关问题 更多 >
编程相关推荐