从记录列表中筛选单元:BS4 Python

2024-10-02 16:22:08 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在从这个platform中删除表。检索完所有信息后,我成功地将它们存储到一个列表中。接下来,我要做的是从包含记录的列表中过滤单位(km2)

例如:

集水区的测量值为0.1 km2,我想删除该列中所有记录的km2

我使用的方法是:

# -*- coding utf-8 -*-
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
import time
import os
import shutil
from bs4 import BeautifulSoup
import uuid
import csv
import dateutil.parser as parser
import pandas as pd

from selenium.webdriver.support.select import Select


class crawlOcean():

    def __init__(self):
        print("hurray33")
        global downloadDir
        global uFileName
        global filname
        downloadDir = ""
        uFileName = str(uuid.uuid4())
        filname = downloadDir + uFileName + ".csv"
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 1000)
        fp = webdriver.FirefoxProfile()
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference("browser.download.manager.showWhenStarting", False)
        fp.set_preference("browser.download.dir", downloadDir)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "attachment/csv")
        options = Options()
        options.add_argument("--headless")
        self.driver = webdriver.Firefox(firefox_profile=fp)
        #self.driver = webdriver.Firefox()
        print("hurray")
        self.driver.implicitly_wait(15)
        self.driver.get("http://www.epa.ie/hydronet/#Water%20Levels")
        self.verificationErrors = []
        self.accept_next_alert = True

    def crawl(self):
        print("see")
        driver = self.driver
        driver.execute_script("window.scrollTo(0, 800)")
        driver.find_element_by_id("dijit_MenuItem_3_text").click()
        driver.find_element_by_xpath('//td[.="All"]').click()
        driver.find_element_by_xpath('//td[.="Active EPA/LA (239)"]').click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        headers = []
        for m in soup.find_all("th"):
            headers.append(m.get_text())
        print(headers)
        content = []
        finalContent = []
        filterList = ["km²"]
        for table in soup.find_all("table")[5::]:
            for row in table.find_all("tr"):
                contentCells = []
                for cells in row.find_all("td"):
                    if cells.text.split()not in filterList:
                        contentCells.append(cells.text)
                content.append(contentCells)


        print(content)
        with open(filname, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(content)
        driver.close()



if __name__ == '__main__':
    obj = crawlOcean()
    obj.crawl()

样本输出:

['Station number', 'Station name', 'Waterbody', 'Status', 'Type of Gauge', 'Catchment area', 'Data Provider', 'River Basin', 'Timestamp', 'Value', 'Unit']
[['14107', 'BAYLOUGH BR.', 'BARROW', 'Active', 'Recorder', '431.50 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 09:00', '58.419', 'm OD Malin (OSGM02)'], ['18118', 'SHANBALLYMORE', 'SPRING', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Blackwater (Munster)', '25-10-2018 09:00', '0.432', '---'], ['14108', 'BALLYNAFAGH', 'STREAM', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 09:00', '84.531', 'm OD Malin (OSGM02)'], ['14104', 'GREESEMOUNT', 'GREESE', 'Active', 'Recorder', '74.60 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 08:00', '90.877', 'm OD Malin (OSGM02)'], ['14100', 'KYLE SPRING', 'SPRING', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Barrow', '17-10-2018 14:15', '91.595', 'm OD Malin (OSGM02)'], ['03059', 'DRUMULLY BR.', 'MOUNTAIN WATER', 'Active', 'Staff Gauge Only', '0.00 km²', 'Environmental Protection Agency', 'Blackwater (Ulster)', '01-10-2018 12:45', '100.280', 'm (TBM)'], ['03070', 'EMY LOUGH', 'EMY LOUGH', 'Active', 'Recorder', '', 'Environmental Protection Agency', 'Blackwater (Ulster)', '25-10-2018 09:00', '50.884', 'm OD (Poolbeg)'], ['03057', 'EMYVALE WEIR', 'MOUNTAIN WATER', 'Active', 'Recorder', '37.80 km²', 'Environmental Protection Agency', 'Blackwater (Ulster)', '25-10-2018 09:00', '52.431', 'm OD Malin (OSGM02)'], ['18048', 'DROMCUMMER', 'BLACKWATER [MUNSTER]', 'Active', 'Recorder', '867.70 km²', 'Environmental Protection Agency', 'Blackwater (Munster)', '25-10-2018 01:00', '60.605', 'm OD Malin (OSGM02)'], ['14057', 'TIMOLIN', 'BOTHOGUE', 'Active', 'Recorder', '18.20 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 09:00', '89.953', 'm OD Malin (OSGM02)'], ['28011', 'DOO LOUGH OUTFLOW', '---', 'Active', 'Recorder', '22.81 km²', 'Environmental Protection Agency', 'Annageeragh-Annagh-Creegh', '25-10-2018 09:00', '80.743', 'm OD Malin (OSGM02)'], ['07071', 'SKEAGH', 'SKEAGH L.', 'Active', 'Recorder', '5.30 km²', 'Environmental Protection Agency', 'Boyne', '15-08-2018 14:00', '149.503', 'm OD (Poolbeg)'], ['03051', 'FAULKLAND', 'BLACKWATER (MONAGHAN)', 'Active', 'Recorder', '143.20 km²', 'Environmental Protection Agency', 'Blackwater (Ulster)', '19-09-2018 17:00', '41.572', 'm OD Malin (OSGM02)'], ['07077', 'WHITE LOUGH.', 'ANNAGH OR WHITE LOUGH', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Boyne', '05-09-2018 13:30', '105.169', 'm OD Malin (OSGM02)'], ['10038', 'DRUIDS GLEN', 'STREAM', 'Active', 'Recorder', '16.00 km²', 'Environmental Protection Agency', 'Potters-Redcross-Three Mile Water', '25-10-2018 09:00', '19.377', 'm OD Malin (OSGM02)'], ['36031', 'LISDARN', 'CAVAN', 'Active', 'Recorder', '63.80 km²', 'Environmental Protection Agency', 'Erne', '25-10-2018 09:00', '52.317', 'm OD Malin (OSGM02)'], ['07074', 'BALLANY', 'LENE L.', 'Active', 'Recorder', '13.00 km²', 'Environmental Protection Agency', 'Boyne', '25-10-2018 09:00', '92.667', 'm OD Malin (OSGM02)'], ['16047', 'CARROWCLOGH', 'ARA', 'Active', 'Recorder', '44.10 km²', 'Environmental Protection Agency', 'Suir', '25-10-2018 08:00', '80.820', 'm OD Malin (OSGM02)'], ['30012', 'CLAREGALWAY', 'CLARE', 'Active', 'Recorder', '1072.90 km²', 'Environmental Protection Agency', 'Corrib', '24-10-2018 20:30', '6.355', 'm OD Malin (OSGM02)'], ['16045', 'BALLYSHONOCK RESRVR.', 'DAWN', 'Active', 'Recorder', '5.20 km²', 'Environmental Protection Agency', 'Suir', '09-08-2018 13:15', '89.683', 'm OD Malin (OSGM02)'], ['07078', 'LOUGH BANE', 'BANE L.', 'Active', 'Recorder', '', 'Environmental Protection Agency', '---', '25-10-2018 09:00', '111.051', 'm OD Malin (OSGM02)'], ['10028', 'KNOCKNAMOHILL', 'AUGHRIM', 'Active', 'Recorder', '203.00 km²', 'Environmental Protection Agency', 'Avoca', '24-10-2018 20:00', '21.322', 'm OD Malin (OSGM02)'], ['29071', 'CUTRA', 'L.    CUTRA', 'Active', 'Recorder', '123.80 km²', 'Environmental Protection Agency', 'Kinvarra', '25-10-2018 08:15', '32.694', 'm OD Malin (OSGM02)'], ['25070', 'WHITEBRIDGE.', 'L.ENNELL', 'Active', 'Recorder', '147.80 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '78.934', 'm OD Malin (OSGM15)'], ['29018', 'CLARINBRIDGE SPRING', 'SPRING', 'Active', 'Recorder', '', 'Environmental Protection Agency', 'Kilcogan', '27-09-2018 12:30', '2.926', 'm OD Malin (OSGM02)'], ['25072', 'CAPTAIN S HILL', 'OWEL L.', 'Active', 'Recorder', '22.60 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '96.146', 'm OD Malin (OSGM15)'], ['10021', 'COMMON S ROAD', 'SHANGANAGH', 'Active', 'Recorder', '32.50 km²', 'Environmental Protection Agency', 'Loughlinstown', '25-10-2018 08:00', '10.806', 'm OD (Poolbeg)'], ['26204', 'BALLYMARTIN', 'HIND', 'Active', 'Recorder', '44.70 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '40.009', 'm OD Malin (OSGM02)'], ['33001', 'GLENAMOY', 'GLENAMOY', 'Active', 'Recorder', '76.10 km²', 'Environmental Protection Agency', 'Glenamoy-Ballinglen-Glencullen', '25-10-2018 09:00', '3.901', 'm OD Malin (OSGM15)'], ['22031', 'KILLARNEY SW (New)', 'L. LEANE TRIB', 'Active', 'Recorder', '0.06 km²', 'Environmental Protection Agency', 'Laune', '21-08-2018 11:15', '19.570', 'm OD Malin (OSGM02)'], ['32076', 'DOO LOUGH', 'DOO LOUGH [MAYO]', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Owenglin-Dawros-Culin-Traheen', '03-09-2018 11:30', '30.237', 'm OD Malin (OSGM02)'], ['32073', 'LETTERETTRIM', 'L.    FEE', 'Active', 'Recorder', '15.70 km²', 'Environmental Protection Agency', 'Carrownisky-Owenwee-Carrowbeg', '25-10-2018 09:00', '44.589', 'm OD Malin (OSGM15)'], ['32070', 'L.FEEAGH', 'L.    FEEAGH', 'Active', 'Recorder', '84.30 km²', 'Environmental Protection Agency', 'Srahmore', '25-10-2018 09:00', '10.852', 'm OD Malin (OSGM15)'], ['32026', 'BUNDORRAGHA', 'BUNDORRAGHA', 'Active', 'Recorder', '48.30 km²', 'Environmental Protection Agency', 'Owenglin-Dawros-Culin-Traheen', '25-10-2018 09:00', '3.358', 'm OD Malin (OSGM15)'], ['25046', 'LISMOYNY', 'BROSNA', 'Active', 'Recorder', '304.50 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '57.021', 'm OD Malin (OSGM15)'], ['35073', 'L.GILL', 'L.GILL', 'Active', 'Recorder', '362.60 km²', 'Environmental Protection Agency', 'Garvogue', '25-10-2018 09:00', '3.823', 'm OD Malin (OSGM15)'], ['25044', 'COOLE', 'KILMASTULLA', 'Active', 'Recorder', '92.54 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 08:15', '29.649', 'm OD Malin (OSGM02)'], ['35072', 'TRASGARVE', 'L.    EASKY', 'Active', 'Recorder', '10.70 km²', 'Environmental Protection Agency', 'Easky-Dunneil', '06-09-2018 10:45', '183.646', 'm OD Malin (OSGM02)']

有谁能帮我找到正确的方法吗


Tags: importselfdriveractiverecordersetagencykm
3条回答

要删除字符串但保留值吗?在这种情况下,您应该:

# just before you append the row
contentCells[5] = re.sub("[^0-9.,]", "", contentCells[5])
content.append(contentCells)

或者如果您想跳过该列:

# remove column before add the row
del contentCells[5]

# or filter 

filterList = {"km²"} # create a set with all words
for table in soup.find_all("table")[5::]:
    for row in table.find_all("tr"):
        contentCells = []
        for cells in row.find_all("td"):
            # create a new set with the cells text, 
            # and intersect with the filter set, if the cell doesn't 
            # contains any filtered text word, it will return an empty set  
            if not set(cells.text.split()).intersection(filterList):
                contentCells.append(cells.text)
        content.append(contentCells)

如果你有一个字符串列表,其中一些字符串以结尾,例如

my_list = ['14107', 'BAYLOUGH BR.', '25-10-2018 09:00', '58.419', 'm OD Malin (OSGM02)', '0.00 km²']

你可以试试

my_list = [item for item in my_list if not item.endswith(" km²")]

除去以" km²"结尾的元素

更新

如果要从元素中删除" km²"部分,请尝试:

my_list = [item.rstrip(" km²") for item in my_list]

对你来说可能是

contentCells = [cell.text.rstrip(" km²") for cell in row.find_all("td") if cell.text.strip()]

谢谢大家,我已经用下面的方法解决了

filtter = ['km²']
        for table in soup.find_all("table")[5::]:
            for row in table.find_all("tr"):
                contentCells = []
                for cells in row.find_all("td"):
                    contentCells.append(cells.get_text())
            content.append(contentCells)
        for idx, v in enumerate(content):
            for t in filtter:
                content[idx] = [i.replace(t, '') for i in content[idx]]
                #.append(timerecorded)
        print(content)

相关问题 更多 >