Python:先获取某个标签,再解析 HTML

# Created by Spencer Fontein on 5/28/14.
# Copyright (c) 2014 Spencer Fontein. All rights reserved.
# coding: utf-8

import pprint
from lxml import etree
import cgi
from bs4 import BeautifulSoup
import datetime
import urllib2
import cookielib
import re

#where to send the file at the end
output_path = ""#"/home/spencerf/public_html/rpi/"

# Substrings removed from a link label, in exactly this order, before the
# leading newline is swapped for the output-file prefix.
_LABEL_TOKENS_BEFORE_PREFIX = (
    "Click ", 'For ', 'Menu ', 'of ', 'Week ', 'Here ', 'Of ',
    'January ', 'February ', 'March ', 'April ', 'May ',
    'June ', 'July ', 'August ', 'September ',
    'October ', 'November ', 'December ',
    '1', "2", "3", "4", '5', "6", "7", "8", '9', "0", '-',
)

# Substrings removed, in this order, after the prefix has been inserted.
_LABEL_TOKENS_AFTER_PREFIX = ('/', '!', ', ', ' ', 'College')


def Get_website_text(url):
    """Fetch ``url`` with a cookie-aware, browser-like opener.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as returned by ``response.read()``.
    """
    # url for website
    base_url = url
    # file for storing cookies
    cookie_file = 'mfp.cookies'
    # set up a cookie jar to store cookies; the jar is only kept in memory
    # here because save() is never called on it
    cj = cookielib.MozillaCookieJar(cookie_file)
    # set up opener to handle cookies, redirects etc
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.HTTPSHandler(debuglevel=0),
        urllib2.HTTPCookieProcessor(cj)
    )
    # pretend we're a web browser and not a python script
    opener.addheaders = [
        ('User-agent',
         ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) '
          'AppleWebKit/535.1 (KHTML, like Gecko) '
          'Chrome/13.0.782.13 Safari/535.1'))
    ]
    # open the front page of the website to set initial cookies
    response = opener.open(base_url)
    try:
        return response.read()
    finally:
        # release the connection even if read() raises
        response.close()


def _menu_name_from_label(label):
    """Reduce a menu link label to the base name used for its output file."""
    name = label
    for token in _LABEL_TOKENS_BEFORE_PREFIX:
        name = name.replace(token, '')
    # every newline left in the label becomes the file-name prefix
    name = name.replace('\n', 'rpi_russell_sage_menu')
    for token in _LABEL_TOKENS_AFTER_PREFIX:
        name = name.replace(token, '')
    return name


#get union menus
def getUnionMenuUrls(soup):
    """Collect ``[name, url]`` pairs for the menu links on the dining page.

    Args:
        soup: Parsed dining page; must contain a ``div`` whose id is
            ``accordion_23473``.

    Returns:
        A list of two-item lists ``[name, absolute_url]``, one per link whose
        href contains ``.htm``.

    Raises:
        IndexError: if the accordion ``div`` is not present.
    """
    monthly_urls = soup.findAll('div', {'id': 'accordion_23473'})[0]('a', href=True)
    menu_urls = []
    url = "https://rpi.sodexomyway.com"
    for tag in monthly_urls:
        href = tag['href']
        if ".htm" in href:
            name = _menu_name_from_label(str(tag.text))
            menu_urls.append([name, url + href])
    return menu_urls


def get_xml(url):
    """Download a weekly menu page and convert it to an XML string.

    The output nests ``menu > day > meal > counter > dish`` elements, with
    each dish holding a ``<name>`` child.

    Args:
        url: Address of the weekly menu HTML page.

    Returns:
        The generated XML, one element per line, joined with newlines.
    """
    tag_stack = []
    output_lines = []
    response = urllib2.urlopen(url)
    try:
        # NOTE(review): the collapsed source shows replace(' ', ""), which
        # would delete every space and break the tag markup the XPath
        # queries below depend on; assumed to be a decoded '&nbsp;' entity
        # -- confirm against the original script.
        html = response.read().replace('&nbsp;', "")
    finally:
        response.close()
    xml = etree.HTML(html)
    open_tag(tag_stack, output_lines, "menu", "")
    days = xml.xpath('//td[@class="dayouter"]')
    # make the xml for each day
    for day in days:
        day_name = day.xpath('./a/@name')[0]
        safe_open_tag(tag_stack, output_lines, "day", "menu", day_name)
        dayinner_trs = day.xpath('.//table[@class="dayinner"]//tr')
        for dayinner_tr in dayinner_trs:
            # change meal
            if dayinner_tr.xpath('./td[@class="mealname"]'):
                meal_name = dayinner_tr.xpath('./td[@class="mealname"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "meal", "day", meal_name)
            # change counter
            station_texts = dayinner_tr.xpath('./td[@class="station"]/text()')
            if station_texts:
                counter_name = station_texts[0]
                safe_open_tag(tag_stack, output_lines, "counter", "meal", counter_name)
            # change dish
            if dayinner_tr.xpath('./td[@class="menuitem"]'):
                item_name = "".join(
                    dayinner_tr.xpath('./td[@class="menuitem"]/div//text()')
                ).strip()
                safe_open_tag(tag_stack, output_lines, "dish", "counter", "")
                output_lines.append("<name>%s</name>" % cgi.escape(item_name))
    # "" never matches a tag name, so this closes everything still open
    close_tags(tag_stack, output_lines, "")
    output_string = '\n'.join([line.encode('utf-8') for line in output_lines])
    return output_string


# close the tags up to the parent of last tag in tag_stack
def close_tags(tag_stack, output_lines, parent_tag):
    """Pop and close tags until ``parent_tag`` is on top of the stack.

    Args:
        tag_stack: Names of the currently open tags, outermost first; mutated.
        output_lines: Output accumulator; closing tags are appended to it.
        parent_tag: Tag to stop at; it is left open.
    """
    while tag_stack and tag_stack[-1] != parent_tag:
        top = tag_stack.pop()
        output_lines.append(' ' * len(tag_stack) + '</%s>' % top)


# open the new_tag using the suitable style based on name_property
def open_tag(tag_stack, output_lines, new_tag, name_property):
    """Append an opening tag and push it on the stack.

    Args:
        tag_stack: Names of the currently open tags; mutated.
        output_lines: Output accumulator; the opening tag is appended to it.
        new_tag: Element name to open.
        name_property: Value for the ``name`` attribute; when empty the
            attribute is omitted.
    """
    indent = ' ' * len(tag_stack)
    if name_property:
        # escape &, <, > and double quotes so the attribute stays well-formed,
        # matching the escaping already applied to dish names
        safe_name = cgi.escape(name_property, True)
        output_lines.append(indent + '<%s name="%s">' % (new_tag, safe_name))
    else:
        output_lines.append(indent + '<%s>' % new_tag)
    tag_stack.append(new_tag)


# check if the new_tag parent is in the stack, if not it'll add the parent
def safe_open_tag(tag_stack, output_lines, new_tag, parent_tag, name_property):
    """Open ``new_tag`` directly under ``parent_tag``.

    If the parent is not open yet it is opened first (without a name);
    otherwise every tag nested deeper than the parent is closed.

    Args:
        tag_stack: Names of the currently open tags; mutated.
        output_lines: Output accumulator; mutated.
        new_tag: Element name to open.
        parent_tag: Element that must enclose ``new_tag``.
        name_property: Value for the new tag's ``name`` attribute, or empty.
    """
    if parent_tag not in tag_stack:
        output_lines.append(' ' * len(tag_stack) + '<%s>' % parent_tag)
        tag_stack.append(parent_tag)
    else:
        close_tags(tag_stack, output_lines, parent_tag)
    open_tag(tag_stack, output_lines, new_tag, name_property)


# sample use of get_xml function

# In[17]:

if __name__ == "__main__":
    base_url_u = "https://rpi.sodexomyway.com/dining-choices/res/sage.html"
    htmltext_u = Get_website_text(base_url_u)
    soup_u = BeautifulSoup(htmltext_u)
    menu_url_list = getUnionMenuUrls(soup_u)
    for menu in menu_url_list:
        if '.htm' in menu[1]:
            ofname = str(menu[0].replace(" ", "A")) + ".xml"
            output_file = output_path + ofname
            # the with-block closes (and flushes) the file deterministically
            with open(output_file, "w") as out_file:
                out_file.write(get_xml(menu[1]))
        else:
            # single-argument form prints the same text on Python 2 and 3
            print("%s : %s is not valid html." % (menu[0], menu[1]))

# date function
def getCurrentWeekMenu(date1, date2, now=None):
    """Choose which of two weekly menus applies to the current date.

    Args:
        date1: Sequence for the first menu; item 0 is a full English month
            name and item 1 is the day of month followed by two trailing
            characters that are dropped (presumably an ordinal suffix such
            as "22nd" -- TODO confirm against the caller).
        date2: Same layout, for the second menu.
        now: Optional ``datetime`` to treat as the current moment; defaults
            to ``datetime.datetime.now()``.

    Returns:
        0 if the first menu is selected, 1 if the second one is.

    Raises:
        KeyError: if either month name is not a full English month name.
        ValueError: if a day does not parse as an integer once its last two
            characters are removed.
    """
    if now is None:
        now = datetime.datetime.now()
    monthstr = "January,February,March,April,May,June,July,August,September,October,November,December"
    months = monthstr.split(',')
    d = dict(zip(months, range(1, 13)))
    menu_1_month = d[str(date1[0])]
    # looked up only so an unknown month name is rejected; the value itself
    # does not take part in the decision below
    menu_2_month = d[str(date2[0])]
    # compare days as integers: as strings, "9" > "16" and a comparison
    # against the integer now.day is meaningless
    menu_1_day = int(date1[1][:-2])
    menu_2_day = int(date2[1][:-2])
    if menu_1_day > menu_2_day:
        # the first menu starts late in a month, the second early in the next
        menu = 1 if now.day >= menu_1_day else 2
    elif now.day >= menu_2_day or now.month > menu_1_month:
        menu = 2
    else:
        menu = 1
    return menu - 1

1条回答

网友

1楼 · 发布于 2024-09-29 21:41:26

我运行你的代码没有问题

# Download the dining page and parse it, then print the same expression the
# question's getUnionMenuUrls() evaluates.
from BeautifulSoup import BeautifulSoup
import requests
response = requests.get('https://rpi.sodexomyway.com/dining-choices/res/sage.html')
soup = BeautifulSoup(response.text)
# Output of the line taken from your code: the first div with id
# "accordion_23473" is called with ('a', href=True) to list its links.
print soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)

>>> [<a href="#">On the Menu</a>,
     <a href="/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm" target="_blank">
                     9/22/2014 - 9/28/2014</a>,
     <a href="/images/WeeklyMenuRSDH%209-29-14_tcm1068-29441.htm" target="_blank">
                     9/29/2014 - 10/5/2014</a>,
     <a href="#">Hours of Operation</a>]

# now get the href
# Index [1] picks the second link in the list printed above (the first entry
# there is the "#" anchor); dict() turns that tag's attrs into a mapping so
# 'href' can be looked up by key.
url = dict(soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)[1].attrs)['href']
# output
u'/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm'

回答问题的第二部分

^{pr2}$

更新-添加当前周筛选器

def getUnionMenuUrls(soup, today=None):
    """Return the absolute URL of the weekly menu whose range covers today.

    Args:
        soup: Parsed dining page; must contain a ``div`` whose id is
            ``accordion_23473``. The second and third links inside it are
            expected to be labelled ``"m/d/YYYY - m/d/YYYY"``.
        today: Optional ``datetime`` or ``date`` to test against; defaults
            to ``datetime.datetime.today()``.

    Returns:
        The menu URL as a string, or ``None`` when no labelled range
        contains ``today``.

    Raises:
        IndexError: if the accordion ``div`` is not present.
    """
    if today is None:
        today = datetime.datetime.today()  # get todays date
    # Compare calendar dates only: the parsed range ends at midnight, so a
    # datetime taken during the last day of the week would fall outside it.
    current_day = today.date() if isinstance(today, datetime.datetime) else today
    monthly_urls = soup.findAll('div', {'id': 'accordion_23473'})[0]('a', href=True)[1:3]  # cut extra links
    url = "https://rpi.sodexomyway.com"
    for tag in monthly_urls:
        href = tag['href']
        if ".htm" not in href:
            continue
        # split the label into its two dates; the link text carries a leading
        # newline and indentation that strptime would reject
        datestrings = [piece.strip() for piece in tag.text.split(' - ')]
        if len(datestrings) != 2:
            continue  # label is not a "start - end" range
        try:
            first_day, last_day = [
                datetime.datetime.strptime(d, '%m/%d/%Y').date() for d in datestrings
            ]
        except ValueError:
            continue  # label does not hold two m/d/YYYY dates
        if first_day <= current_day <= last_day:  # check if today in that range
            return url + href
    return None

相关问题更多 >

编程相关推荐

热门问题

热门文章