擅长:Python、MySQL、Java
<pre><code>import httplib
from lxml import html
# Fetch one page over plain HTTP and print every link that lxml reports
# as coming from a 'src' attribute.

#CONNECTION
url = "www.darlighting.co.uk"
path = "/"

# Request headers sent along with the GET.
header = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)", "Cache-Control": "no-cache"}

conn = httplib.HTTPConnection(url)
# Stays None unless the server answers 200, so the extraction step below
# can tell "nothing to parse" apart from a fetched page.
source = None
try:
    # request() sends the request line, every header and the end-of-headers
    # marker in one call.
    conn.request("GET", path, headers=header)
    res = conn.getresponse()
    if res.status == 200:
        source = res.read()
    else:
        # Show what the server answered instead of the page.
        print(res.status)
        print(res.getheaders())
finally:
    # Release the socket whether or not the request succeeded.
    conn.close()

#EXTRACT
# Skip parsing when the fetch failed or the body came back empty.
if source:
    dochtml = html.fromstring(source)
    # Each item unpacks into the element, the attribute holding the link,
    # the link itself and its position.
    for elem, att, link, pos in dochtml.iterlinks():
        if att == 'src':  # use 'href' here to list hyperlinks instead
            print('elem: {0} || pos {1}: || attr: {2} || link: {3}'.format(elem, pos, att, link))
</code></pre>