比较两个巨大的csv文件时的性能改进[python]

# Question's original (slow) approach, restored to a multi-line layout.
# For every row of file1.csv, file2.csv is reopened and scanned from the top
# until a row with the same timestamp is found, so the work grows with
# len(file1) * len(file2).
with open("file1.csv", 'r') as csv1:
    with open("out.csv", 'w') as myoutput:
        writer = csv.writer(myoutput)
        row_count = 0
        headerSet = 0
        for row in csv.reader(csv1):
            # file2.csv is reopened for each file1 row so the scan restarts
            # at its first line.
            with open("file2.csv", 'r') as csv2:
                in2 = csv.reader(csv2)
                for mrow in in2:
                    if row_count == 0 and headerSet == 0:
                        # Generate Header Row for the output csv file
                        writer.writerow(row + ["Col3"])
                        headerSet = 1
                    else:
                        # Code to fetch timestamp from csv1 and csv2
                        # (elided in the original post; it is expected to
                        # define csv1_ts and csv2_ts for the current rows)
                        if csv1_ts == csv2_ts:
                            # Fetch 2nd column value from csv2
                            val = mrow[1]
                            writer.writerow(row + [val])
                            # Only the first match is used; stop scanning.
                            break
                        else:
                            continue
            # NOTE(review): the nesting level of this increment is inferred
            # from the flattened original (once per file1 row) — confirm.
            row_count += 1

1条回答

网友

1楼 · 发布于 2024-09-30 14:21:33

因为两个文件中的行看起来都是按时间排序的，所以可以先从两个文件中各读取一行。如果两行的时间戳相同，就把该行写入输出文件，然后两个文件都前进到下一行。如果时间戳不同，则只从当前时间戳较小的那个文件中读取下一行。下面是这一思路的简单实现：

import csv

def get_key(row):
    """Return the comparison key ``(date_parts, row[1])`` for a CSV row.

    ``row[0]`` is split on ``'/'`` and each piece is converted to ``int``;
    the first and third pieces are then exchanged. ``row[1]`` is returned
    unchanged as the second element of the key.

    NOTE(review): presumably ``row[0]`` is a ``day/month/year`` date, so the
    swap yields ``[year, month, day]`` and keys compare chronologically —
    confirm against the actual file format.

    Raises:
        ValueError: if a piece of ``row[0]`` is not an integer literal.
        IndexError: if ``row[0]`` has fewer than three pieces or ``row`` has
            fewer than two columns.
    """
    numbers = list(map(int, row[0].split('/')))
    leading = numbers[0]
    numbers[0] = numbers[2]
    numbers[2] = leading
    return (numbers, row[1])

# Single-pass merge of two CSV files whose rows are sorted by get_key().
# One row from each file is held at a time; whichever side has the smaller
# key is advanced, and rows with equal keys are written to out.csv.
#
# newline='' is what the csv module asks for on every file object it is
# given: without it the writer can emit '\r\r\n' line endings on platforms
# that translate '\n', and the reader can misinterpret newlines embedded in
# quoted fields.
with open('file1.csv', newline='') as csv1, \
        open('file2.csv', newline='') as csv2, \
        open('out.csv', 'w', newline='') as out:
    # The names are rebound to the csv reader/writer objects; the underlying
    # files are still closed by the with-statement.
    csv1 = csv.reader(csv1)
    csv2 = csv.reader(csv2)
    out = csv.writer(out)

    # Header: file1's first row plus the name of the appended column.
    out.writerow(next(csv1) + ['Col3'])
    row1 = next(csv1, None)
    # NOTE(review): file2's first row is treated as data, not as a header —
    # confirm file2.csv has no header row, otherwise get_key() will fail on it.
    row2 = next(csv2, None)

    # Stops when either file is exhausted (next() returned None). An empty
    # row is falsy as well, so it also ends the loop.
    while row1 and row2:
        key1 = get_key(row1)
        key2 = get_key(row2)
        if key1 < key2:
            # file1 is behind: no match for this row, advance file1 only.
            row1 = next(csv1, None)
        elif key1 > key2:
            # file2 is behind: advance file2 only.
            row2 = next(csv2, None)
        else:
            # Keys match: append file2's last column to file1's row.
            out.writerow(row1 + row2[-1:])
            row1 = next(csv1, None)
            row2 = next(csv2, None)

相关问题更多 >

编程相关推荐

热门问题

热门文章