擅长:python、mysql、java
<p>这是你想要的密码。但是,请停止使用regex解析HTML。漂亮的组合才是最好的选择。你知道吗</p>
<pre><code>import re
from urllib import urlopen
def readwebpage(url):
print "testing ",current
return urlopen(url).read()
url = 'http://xrisk.esy.es' #put starting url here
yet_to_visit= [url]
visited_urls = []
AmountVisited = 0
maxhits = 10
while (yet_to_visit and AmountVisited<maxhits):
print yet_to_visit
current = yet_to_visit.pop()
AmountVisited = AmountVisited + 1
html = readwebpage(current)
if html == None:
print "* bad reference to http", current
else:
r = re.compile('(?<=href=").*?(?=")')
links = re.findall(r,html) #Creates a list of all links in HTML code starting with...
for u in links:
if u in visited_urls:
continue
elif u.find('http')!=-1:
yet_to_visit.append(u)
print links
visited_urls.append(current)
</code></pre>