从记录列表中筛选单元：BS4 Python

# -*- coding utf-8 -*- from selenium.webdriver.firefox.options import Options from selenium import webdriver import time import os import shutil from bs4 import BeautifulSoup import uuid import csv import dateutil.parser as parser import pandas as pd from selenium.webdriver.support.select import Select class crawlOcean(): def __init__(self): print("hurray33") global downloadDir global uFileName global filname downloadDir = "" uFileName = str(uuid.uuid4()) filname = downloadDir + uFileName + ".csv" pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.width', 1000) fp = webdriver.FirefoxProfile() fp.set_preference("browser.download.folderList", 2) fp.set_preference("browser.download.manager.showWhenStarting", False) fp.set_preference("browser.download.dir", downloadDir) fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "attachment/csv") options = Options() options.add_argument("--headless") self.driver = webdriver.Firefox(firefox_profile=fp) #self.driver = webdriver.Firefox() print("hurray") self.driver.implicitly_wait(15) self.driver.get("http://www.epa.ie/hydronet/#Water%20Levels") self.verificationErrors = [] self.accept_next_alert = True def crawl(self): print("see") driver = self.driver driver.execute_script("window.scrollTo(0, 800)") driver.find_element_by_id("dijit_MenuItem_3_text").click() driver.find_element_by_xpath('//td[.="All"]').click() driver.find_element_by_xpath('//td[.="Active EPA/LA (239)"]').click() soup = BeautifulSoup(driver.page_source, 'html.parser') headers = [] for m in soup.find_all("th"): headers.append(m.get_text()) print(headers) content = [] finalContent = [] filterList = ["km²"] for table in soup.find_all("table")[5::]: for row in table.find_all("tr"): contentCells = [] for cells in row.find_all("td"): if cells.text.split()not in filterList: contentCells.append(cells.text) content.append(contentCells) print(content) with open(filname, 'w', newline='') as f: writer = csv.writer(f) writer.writerow(headers) writer.writerows(content) driver.close() if __name__ == '__main__': obj = crawlOcean() obj.crawl()

['Station number', 'Station name', 'Waterbody', 'Status', 'Type of Gauge', 'Catchment area', 'Data Provider', 'River Basin', 'Timestamp', 'Value', 'Unit'] [['14107', 'BAYLOUGH BR.', 'BARROW', 'Active', 'Recorder', '431.50 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 09:00', '58.419', 'm OD Malin (OSGM02)'], ['18118', 'SHANBALLYMORE', 'SPRING', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Blackwater (Munster)', '25-10-2018 09:00', '0.432', '---'], ['14108', 'BALLYNAFAGH', 'STREAM', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 09:00', '84.531', 'm OD Malin (OSGM02)'], ['14104', 'GREESEMOUNT', 'GREESE', 'Active', 'Recorder', '74.60 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 08:00', '90.877', 'm OD Malin (OSGM02)'], ['14100', 'KYLE SPRING', 'SPRING', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Barrow', '17-10-2018 14:15', '91.595', 'm OD Malin (OSGM02)'], ['03059', 'DRUMULLY BR.', 'MOUNTAIN WATER', 'Active', 'Staff Gauge Only', '0.00 km²', 'Environmental Protection Agency', 'Blackwater (Ulster)', '01-10-2018 12:45', '100.280', 'm (TBM)'], ['03070', 'EMY LOUGH', 'EMY LOUGH', 'Active', 'Recorder', '', 'Environmental Protection Agency', 'Blackwater (Ulster)', '25-10-2018 09:00', '50.884', 'm OD (Poolbeg)'], ['03057', 'EMYVALE WEIR', 'MOUNTAIN WATER', 'Active', 'Recorder', '37.80 km²', 'Environmental Protection Agency', 'Blackwater (Ulster)', '25-10-2018 09:00', '52.431', 'm OD Malin (OSGM02)'], ['18048', 'DROMCUMMER', 'BLACKWATER [MUNSTER]', 'Active', 'Recorder', '867.70 km²', 'Environmental Protection Agency', 'Blackwater (Munster)', '25-10-2018 01:00', '60.605', 'm OD Malin (OSGM02)'], ['14057', 'TIMOLIN', 'BOTHOGUE', 'Active', 'Recorder', '18.20 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 09:00', '89.953', 'm OD Malin (OSGM02)'], ['28011', 'DOO LOUGH OUTFLOW', '---', 'Active', 'Recorder', '22.81 km²', 'Environmental Protection Agency', 'Annageeragh-Annagh-Creegh', '25-10-2018 09:00', '80.743', 'm OD Malin (OSGM02)'], ['07071', 'SKEAGH', 'SKEAGH L.', 'Active', 'Recorder', '5.30 km²', 'Environmental Protection Agency', 'Boyne', '15-08-2018 14:00', '149.503', 'm OD (Poolbeg)'], ['03051', 'FAULKLAND', 'BLACKWATER (MONAGHAN)', 'Active', 'Recorder', '143.20 km²', 'Environmental Protection Agency', 'Blackwater (Ulster)', '19-09-2018 17:00', '41.572', 'm OD Malin (OSGM02)'], ['07077', 'WHITE LOUGH.', 'ANNAGH OR WHITE LOUGH', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Boyne', '05-09-2018 13:30', '105.169', 'm OD Malin (OSGM02)'], ['10038', 'DRUIDS GLEN', 'STREAM', 'Active', 'Recorder', '16.00 km²', 'Environmental Protection Agency', 'Potters-Redcross-Three Mile Water', '25-10-2018 09:00', '19.377', 'm OD Malin (OSGM02)'], ['36031', 'LISDARN', 'CAVAN', 'Active', 'Recorder', '63.80 km²', 'Environmental Protection Agency', 'Erne', '25-10-2018 09:00', '52.317', 'm OD Malin (OSGM02)'], ['07074', 'BALLANY', 'LENE L.', 'Active', 'Recorder', '13.00 km²', 'Environmental Protection Agency', 'Boyne', '25-10-2018 09:00', '92.667', 'm OD Malin (OSGM02)'], ['16047', 'CARROWCLOGH', 'ARA', 'Active', 'Recorder', '44.10 km²', 'Environmental Protection Agency', 'Suir', '25-10-2018 08:00', '80.820', 'm OD Malin (OSGM02)'], ['30012', 'CLAREGALWAY', 'CLARE', 'Active', 'Recorder', '1072.90 km²', 'Environmental Protection Agency', 'Corrib', '24-10-2018 20:30', '6.355', 'm OD Malin (OSGM02)'], ['16045', 'BALLYSHONOCK RESRVR.', 'DAWN', 'Active', 'Recorder', '5.20 km²', 'Environmental Protection Agency', 'Suir', '09-08-2018 13:15', '89.683', 'm OD Malin (OSGM02)'], ['07078', 'LOUGH BANE', 'BANE L.', 'Active', 'Recorder', '', 'Environmental Protection Agency', '---', '25-10-2018 09:00', '111.051', 'm OD Malin (OSGM02)'], ['10028', 'KNOCKNAMOHILL', 'AUGHRIM', 'Active', 'Recorder', '203.00 km²', 'Environmental Protection Agency', 'Avoca', '24-10-2018 20:00', '21.322', 'm OD Malin (OSGM02)'], ['29071', 'CUTRA', 'L. CUTRA', 'Active', 'Recorder', '123.80 km²', 'Environmental Protection Agency', 'Kinvarra', '25-10-2018 08:15', '32.694', 'm OD Malin (OSGM02)'], ['25070', 'WHITEBRIDGE.', 'L.ENNELL', 'Active', 'Recorder', '147.80 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '78.934', 'm OD Malin (OSGM15)'], ['29018', 'CLARINBRIDGE SPRING', 'SPRING', 'Active', 'Recorder', '', 'Environmental Protection Agency', 'Kilcogan', '27-09-2018 12:30', '2.926', 'm OD Malin (OSGM02)'], ['25072', 'CAPTAIN S HILL', 'OWEL L.', 'Active', 'Recorder', '22.60 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '96.146', 'm OD Malin (OSGM15)'], ['10021', 'COMMON S ROAD', 'SHANGANAGH', 'Active', 'Recorder', '32.50 km²', 'Environmental Protection Agency', 'Loughlinstown', '25-10-2018 08:00', '10.806', 'm OD (Poolbeg)'], ['26204', 'BALLYMARTIN', 'HIND', 'Active', 'Recorder', '44.70 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '40.009', 'm OD Malin (OSGM02)'], ['33001', 'GLENAMOY', 'GLENAMOY', 'Active', 'Recorder', '76.10 km²', 'Environmental Protection Agency', 'Glenamoy-Ballinglen-Glencullen', '25-10-2018 09:00', '3.901', 'm OD Malin (OSGM15)'], ['22031', 'KILLARNEY SW (New)', 'L. LEANE TRIB', 'Active', 'Recorder', '0.06 km²', 'Environmental Protection Agency', 'Laune', '21-08-2018 11:15', '19.570', 'm OD Malin (OSGM02)'], ['32076', 'DOO LOUGH', 'DOO LOUGH [MAYO]', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Owenglin-Dawros-Culin-Traheen', '03-09-2018 11:30', '30.237', 'm OD Malin (OSGM02)'], ['32073', 'LETTERETTRIM', 'L. FEE', 'Active', 'Recorder', '15.70 km²', 'Environmental Protection Agency', 'Carrownisky-Owenwee-Carrowbeg', '25-10-2018 09:00', '44.589', 'm OD Malin (OSGM15)'], ['32070', 'L.FEEAGH', 'L. FEEAGH', 'Active', 'Recorder', '84.30 km²', 'Environmental Protection Agency', 'Srahmore', '25-10-2018 09:00', '10.852', 'm OD Malin (OSGM15)'], ['32026', 'BUNDORRAGHA', 'BUNDORRAGHA', 'Active', 'Recorder', '48.30 km²', 'Environmental Protection Agency', 'Owenglin-Dawros-Culin-Traheen', '25-10-2018 09:00', '3.358', 'm OD Malin (OSGM15)'], ['25046', 'LISMOYNY', 'BROSNA', 'Active', 'Recorder', '304.50 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '57.021', 'm OD Malin (OSGM15)'], ['35073', 'L.GILL', 'L.GILL', 'Active', 'Recorder', '362.60 km²', 'Environmental Protection Agency', 'Garvogue', '25-10-2018 09:00', '3.823', 'm OD Malin (OSGM15)'], ['25044', 'COOLE', 'KILMASTULLA', 'Active', 'Recorder', '92.54 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 08:15', '29.649', 'm OD Malin (OSGM02)'], ['35072', 'TRASGARVE', 'L. EASKY', 'Active', 'Recorder', '10.70 km²', 'Environmental Protection Agency', 'Easky-Dunneil', '06-09-2018 10:45', '183.646', 'm OD Malin (OSGM02)']

3条回答

网友

1楼 · 编辑于 2024-10-02 16:22:08

要删除字符串但保留值吗？在这种情况下，您应该：

# just before you append the row
contentCells[5] = re.sub("[^0-9.,]", "", contentCells[5])
content.append(contentCells)

或者如果您想跳过该列：

# remove column before add the row
del contentCells[5]

# or filter 

filterList = {"km²"} # create a set with all words
for table in soup.find_all("table")[5::]:
    for row in table.find_all("tr"):
        contentCells = []
        for cells in row.find_all("td"):
            # create a new set with the cells text, 
            # and intersect with the filter set, if the cell doesn't 
            # contains any filtered text word, it will return an empty set  
            if not set(cells.text.split()).intersection(filterList):
                contentCells.append(cells.text)
        content.append(contentCells)

网友

2楼 · 编辑于 2024-10-02 16:22:08

如果你有一个字符串列表，其中一些字符串以结尾，例如

my_list = ['14107', 'BAYLOUGH BR.', '25-10-2018 09:00', '58.419', 'm OD Malin (OSGM02)', '0.00 km²']

你可以试试

my_list = [item for item in my_list if not item.endswith(" km²")]

除去以" km²"结尾的元素

更新

如果要从元素中删除" km²"部分，请尝试：

my_list = [item.rstrip(" km²") for item in my_list]

对你来说可能是

contentCells = [cell.text.rstrip(" km²") for cell in row.find_all("td") if cell.text.strip()]

网友

3楼 · 编辑于 2024-10-02 16:22:08

谢谢大家，我已经用下面的方法解决了

filtter = ['km²']
        for table in soup.find_all("table")[5::]:
            for row in table.find_all("tr"):
                contentCells = []
                for cells in row.find_all("td"):
                    contentCells.append(cells.get_text())
            content.append(contentCells)
        for idx, v in enumerate(content):
            for t in filtter:
                content[idx] = [i.replace(t, '') for i in content[idx]]
                #.append(timerecorded)
        print(content)

相关问题更多 >

编程相关推荐

热门问题

热门文章