python中从迭代行和混合行中提取字符串

2024-10-03 13:29:55 发布

您现在位置:Python中文网/ 问答频道 /正文

我有一个数据集如下

"birth_date_1:25        birth_date_2:august     birth_date_3:1945    birth_place_1:france   death_date:<none>   "
"birth_date_1:14        birth_date_2:june       birth_date_3:1995   birth_place_1:dvůr     birth_place_2:králové     birth_place_3:nad       birth_place_4:labem     birth_place_5:,     birth_place_6:czech     birth_place_7:republic  "
"birth_date_1:21        birth_date_2:february       birth_date_3:1869   birth_place_1:blackburn     birth_place_2:,     birth_place_3:england   death_date_1:12     death_date_2:march      death_date_3:1917   "
"birth_date_1:07        birth_date_2:july       birth_date_3:1979   birth_place_1:ghana     birth_place_2:,     birth_place_3:accra "
"birth_date_1:27        birth_date_2:february       birth_date_3:1979   birth_place_1:durban        birth_place_2:,     birth_place_3:south     birth_place_4:africa    "
"birth_date_1:1989  birth_place_1:lima      birth_place_2:,     birth_place_3:peru  "
"birth_date_1:5     birth_date_2:september      birth_date_3:1980   birth_place_1:angola    death_date:<none>   "
"birth_date_1:1     birth_date_2:february       birth_date_3:1856   birth_place_1:hampstead     birth_place_2:,     birth_place_3:london    death_date_1:14     death_date_2:august     death_date_3:1905   "
"birth_date_1:28        birth_date_2:december       birth_date_3:1954   birth_place_1:hickory       birth_place_2:,     birth_place_3:north     birth_place_4:carolina  death_date:<none>   "
"birth_date:<none>  "
"birth_date:<none>  birth_place:<none>  death_date:<none>   "
"birth_date:<none>  birth_place_1:belfast       birth_place_2:,     birth_place_3:northern      birth_place_4:ireland   "
"birth_date:<none>  birth_place:<none>  death_date:<none>   "
"birth_date_1:28        birth_date_2:february       birth_date_3:1891   birth_place_1:carberry      birth_place_2:,     birth_place_3:manitoba  death_date_1:20     death_date_2:september      death_date_3:1968   "
"birth_date_1:4     birth_date_2:november       birth_date_3:1993   birth_place_1:portim√£o     birth_place_2:,     birth_place_3:portugal  "

在这些数据集中,我试图提取如下信息

25.08.1945 \t France \t NA
14.06.1995 \t Dvůr Králové nad Labem,Czech Republic \t NA
21.02.1869 \t Blackburn,England \t 12.03.1917
.
.
.
1989 \t Lima,Peru \t NA
.
.
.
NA \t NA \t NA
NA \t NA \t NA
NA \t Belfast, Northern Ireland \t NA
.
.
04.11.1993 \t Portimeo,Portugal \t NA

我写了下面的代码来实现这一点,但由于数据集中会出现各种情况(例如出生日期可能为空、只有月份名称或只有年份),我觉得下面的循环迟早会在某处出错,并不可行。

    # Asker's attempted script: read each record from the input CSV, try to
    # pick out the date parts, and write a tab-joined version of the line to
    # the output CSV. Left as posted; the review notes mark where it fails.
    outputfile = open('ornek_box_seperated_update.csv','w',encoding="utf-8")
    # NOTE(review): inputfile is never closed in this script.
    inputfile = open('ornek_box_seperated.csv','r',encoding="utf-8")
    import numpy as np

    # 20000 rows x 9 columns of NaN placeholders, stored as a nested list.
    birthDatePlace = [[ np.nan for i in range(9) ] for j in range(20000)]

    for line in inputfile:
        # Split the raw line on every colon; d is now a list of strings.
        d = line.split(":")
        print(d)
        # NOTE(review): d is a list here, but str.split() only accepts a
        # string (or None) as separator, so this call raises TypeError.
        d = line.split(d)
        d = "\t".join(d)
        print(d)
        # NOTE(review): after the join d is a str, so d[1] is one character;
        # comparing it with an int using < or > raises TypeError on Python 3.
        if(d[1]<40 and d[1]>0):
            # NOTE(review): birthDatePlace is a list of lists; indexing it
            # with the tuple (line, 1) raises TypeError.
            birthDatePlace[line,1] = d[1]
        elif(d[1]<2020):
            birthDatePlace[line,3] = d[1]
        # NOTE(review): isinstance() needs two arguments (object, type); the
        # one-argument calls below raise TypeError.
        if(d[1]<40 and d[1]>0 and isinstance(d[3])==str):
            birthDatePlace[line,2] = d[3]
        elif(d[1]<2020 and isinstance(d[3])==int):
            birthDatePlace[line,4] = d[3]

        # this code planned to continue from here until cover the all birth place and death date information in required format

        outputfile.write(d)
        outputfile.write('\n')
    outputfile.close()

非常感谢您能提供的任何帮助。我是 Python 新手,尤其不熟悉正则表达式和字符串提取方法。

提前感谢您的支持。


Tags: and 数据 in none for date line place
2条回答

如果您想避免代码中断,最好进行显式检查。请查看下面的代码。我解析了这些信息并将其存储在一个类对象中。这个类带有一些辅助函数,用来整理解析后的数据。

# -*- coding: utf-8 -*-

# Class for storing parsed information
class Info(object):
    """Hold one parsed record as printable birth date, birth place and death date.

    Every date is passed in as three string components (day, month, year),
    any of which may be empty. The components are joined with "." and a
    purely alphabetic month is converted to its number. A field with no
    usable content, or containing the "<none>" marker, is stored as "NA".

    Attributes:
        birth_date: formatted birth date, e.g. "25.8.1945", "1989" or "NA".
        birth_place: birth place text, or "NA".
        death_date: formatted death date, or "NA".
    """

    # English month abbreviations mapped to their month number.
    _MONTHS = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'may': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12,
    }

    def __init__(self, birth_date_1, birth_date_2, birth_date_3, birth_place, death_date_1, death_date_2, death_date_3):
        """Build the three output fields from the raw string components."""
        self.birth_date = self._format_date(birth_date_1, birth_date_2, birth_date_3)

        # A place consisting only of commas carries no information.
        self.birth_place = birth_place if birth_place.strip(",") else "NA"

        self.death_date = self._format_date(death_date_1, death_date_2, death_date_3)

        self.sanitize()

    def _format_date(self, day, month, year):
        """Join the date components with "." and return "NA" if nothing is left."""
        if not (day or month or year):
            return "NA"
        if month and month.isalpha():
            month = self.month_string_to_number(month)
        # Stripping the dots lets a lone component such as "1989" come out
        # clean. An unrecognised month with no day or year leaves an empty
        # string, which is reported as "NA" rather than a blank column.
        return '.'.join([day, month, year]).strip(".") or "NA"

    def print_req_format(self):
        """Print birth date, birth place and death date separated by tabs."""
        # Parenthesised form is valid on both Python 2 and Python 3.
        print('\t'.join([self.birth_date, self.birth_place, self.death_date]))

    def sanitize(self):
        """Replace any field that contains the "<none>" marker with "NA"."""
        if "<none>" in self.birth_date:
            self.birth_date = "NA"
        if "<none>" in self.birth_place:
            self.birth_place = "NA"
        if "<none>" in self.death_date:
            self.death_date = "NA"

    def month_string_to_number(self, month):
        """Return the month number as a string, or "" if the name is unknown.

        Only the first three letters are compared, case-insensitively, so
        "September", "sept" and "SEP" all give "9".
        """
        number = self._MONTHS.get(month.strip()[:3].lower())
        return "" if number is None else str(number)

# Sample input records. Each string is a whitespace-separated sequence of
# "key:value" tokens; numbered keys (e.g. birth_place_1, birth_place_2) carry
# the successive words of one field, and "<none>" marks a missing field.
dataset = [
"birth_date_1:25        birth_date_2:august     birth_date_3:1945    birth_place_1:france   death_date:<none>",
"birth_date_1:14        birth_date_2:june       birth_date_3:1995   birth_place_1:dvůr     birth_place_2:králové     birth_place_3:nad       birth_place_4:labem     birth_place_5:,     birth_place_6:czech     birth_place_7:republic",
"birth_date_1:21        birth_date_2:february       birth_date_3:1869   birth_place_1:blackburn     birth_place_2:,     birth_place_3:england   death_date_1:12     death_date_2:march      death_date_3:1917",
"birth_date_1:07        birth_date_2:july       birth_date_3:1979   birth_place_1:ghana     birth_place_2:,     birth_place_3:accra",
"birth_date_1:27        birth_date_2:february       birth_date_3:1979   birth_place_1:durban        birth_place_2:,     birth_place_3:south     birth_place_4:africa",
"birth_date_1:1989  birth_place_1:lima      birth_place_2:,     birth_place_3:peru",
"birth_date_1:5     birth_date_2:september      birth_date_3:1980   birth_place_1:angola    death_date:<none>",
"birth_date_1:1     birth_date_2:february       birth_date_3:1856   birth_place_1:hampstead     birth_place_2:,     birth_place_3:london    death_date_1:14     death_date_2:august     death_date_3:1905",
"birth_date_1:28        birth_date_2:december       birth_date_3:1954   birth_place_1:hickory       birth_place_2:,     birth_place_3:north     birth_place_4:carolina  death_date:<none>",
"birth_date:<none>",
"birth_date:<none>  birth_place:<none>  death_date:<none>",
"birth_date:<none>  birth_place_1:belfast       birth_place_2:,     birth_place_3:northern      birth_place_4:ireland",
"birth_date:<none>  birth_place:<none>  death_date:<none>",
"birth_date_1:28        birth_date_2:february       birth_date_3:1891   birth_place_1:carberry      birth_place_2:,     birth_place_3:manitoba  death_date_1:20     death_date_2:september      death_date_3:1968",
"birth_date_1:4     birth_date_2:november       birth_date_3:1993   birth_place_1:portim√£o     birth_place_2:,     birth_place_3:portugal",
]

# Token prefixes that each fill exactly one date component.
date_prefixes = (
    "birth_date_1", "birth_date_2", "birth_date_3",
    "death_date_1", "death_date_2", "death_date_3",
)

# Parse every record into an Info object and print it in the requested format.
for record in dataset:
    dates = dict.fromkeys(date_prefixes, "")
    place = ""

    for token in record.split():
        pieces = token.split(":")
        if len(pieces) < 2:
            # Not a key:value token.
            continue
        value = pieces[1]

        if token.startswith("birth_place"):
            # Words are joined with a space, except that a comma (and the
            # very first word) is attached directly.
            glue = "" if (not place or value == ",") else " "
            place += glue + value
            continue

        for prefix in date_prefixes:
            if token.startswith(prefix):
                dates[prefix] = value
                break

    info = Info(
        dates["birth_date_1"], dates["birth_date_2"], dates["birth_date_3"],
        place,
        dates["death_date_1"], dates["death_date_2"], dates["death_date_3"],
    )
    info.print_req_format()

根据您提供的数据,此代码的输出为:

25.8.1945   france  NA
14.6.1995   dvůr králové nad labem, czech republic   NA
21.2.1869   blackburn, england  12.3.1917
07.7.1979   ghana, accra    NA
27.2.1979   durban, south africa    NA
1989    lima, peru  NA
5.9.1980    angola  NA
1.2.1856    hampstead, london   14.8.1905
28.12.1954  hickory, north carolina NA
NA  NA  NA
NA  NA  NA
NA  belfast, northern ireland   NA
NA  NA  NA
28.2.1891   carberry, manitoba  20.9.1968
4.11.1993   portim√£o, portugal NA

代码很容易理解。希望这对你有用。干杯。

import csv

# Output columns. A token's key is mapped to one of these by removing its
# numeric suffix (e.g. "birth_place_3" -> "birth_place").
FIELDNAMES = ('birth_date', 'birth_place', 'death_date')

# The records contain non-ASCII place names, so the encoding is pinned rather
# than left to the platform default.
with open('infile', 'r', encoding='utf-8') as f:
    result = []
    for line in f:
        record = {k: '' for k in FIELDNAMES}
        # Drop the surrounding quotes/padding, then walk the key:value tokens.
        for kv in line.strip('" \n').split():
            k, sep, v = kv.partition(':')
            key = k.rstrip('_0123456789')
            # Skip tokens that are not key:value pairs, explicit "<none>"
            # markers, and fields that are not part of the output.
            if not sep or v == '<none>' or key not in record:
                continue
            # Words of one field are joined with a space, except that a comma
            # (and the first word) is attached directly.
            value = ' ' + v if record[key] and v != ',' else v
            record[key] += value
        result.append(record)

# newline='' lets the csv module control line endings itself; without it an
# extra blank line appears after every row on Windows.
with open('outfile.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
    writer.writeheader()
    writer.writerows(result)

'outfile.csv'

birth_date,birth_place,death_date
25 august 1945,france,
14 june 1995,"dvůr králové nad labem, czech republic",
21 february 1869,"blackburn, england",12 march 1917
07 july 1979,"ghana, accra",
27 february 1979,"durban, south africa",
1989,"lima, peru",
5 september 1980,angola,
1 february 1856,"hampstead, london",14 august 1905
28 december 1954,"hickory, north carolina",
,,
,,
,"belfast, northern ireland",
,,
28 february 1891,"carberry, manitoba",20 september 1968
4 november 1993,"portim√£o, portugal",

相关问题 更多 >