如何在两个CSV文件之间传输“公共”列

import csv
import collections

# Build a table with file1's (lowercased) columns: columns that also exist in
# file2, plus the two always-kept subject columns, carry file1's data; every
# other column is emitted empty.

csv_file1 = "file1.csv"
csv_file2 = "file2.csv"

# open() inside a with-block replaces the Python 2-only file() builtin and
# closes the handles; newline="" is what the csv module expects (Python 3).
with open(csv_file1, "r", newline="") as handle:
    data1 = list(csv.reader(handle))
with open(csv_file2, "r", newline="") as handle:
    data2 = list(csv.reader(handle))

# An empty file has no header row; treat it as having no columns.
file1_header = data1[0][:] if data1 else []  # get the header from file1
file2_header = data2[0][:] if data2 else []  # get the header from file2

lowered_file1_header = [item.lower() for item in file1_header]
lowered_file2_header = [item.lower() for item in file2_header]

# Columns that are kept even when file2 has no column of the same name.
always_kept = ("subjectprefix", "subjectfirstname")
file2_columns = set(lowered_file2_header)

# Maps each file1 column to its index in file1, or -1 when the column has no
# counterpart in file2. An ordered mapping keeps the output in file1's order.
col_index_dict = collections.OrderedDict()
for index, column in enumerate(lowered_file1_header):
    if column in col_index_dict:
        # Duplicate header name: keep the first occurrence.
        continue
    # Compare lowercased names on both sides so the match is
    # case-insensitive.
    if column in always_kept or column in file2_columns:
        col_index_dict[column] = index
    else:
        col_index_dict[column] = -1  # mark the not matching columns

# Build header
output = [list(col_index_dict)]

# Data rows: skip the header row; unmatched columns and cells missing from
# short rows become empty strings.
for row in data1[1:]:
    output.append([
        row[column_index] if 0 <= column_index < len(row) else ''
        for column_index in col_index_dict.values()
    ])

print(output)

2条回答

网友

1楼 · 编辑于 2024-09-28 22:24:26

欢迎来到编程的世界。让我给你介绍一下神奇的 pandas 库。

我想到了一个可以解决你问题的办法。（我并不是说它效率很高！因此，对于大型数据集，这可能会成为一个问题）

import pandas as pd

# Load both CSV files.
df = pd.read_csv('file1.csv')
df2 = pd.read_csv('file2.csv')

df_columns = set(df.columns)
df2_columns = set(df2.columns)

# Column names present in both files, listed in file1's column order
# (iterating the set intersection directly would give an arbitrary order).
shared_names = df_columns.intersection(df2_columns)
common_columns = [column for column in df.columns if column in shared_names]

common_df = df[common_columns]
common_df2 = df2[common_columns]

## At this point you have the common columns for both CSV's. if you want
## to make them into one, just use df concatenate / append. else, you can save both of them like this:

common_df.to_csv('common1.csv')
common_df2.to_csv('common2.csv')

网友

2楼 · 编辑于 2024-09-28 22:24:26

谢谢 Wboy 的贡献，你的意见真的很有用。

我利用 pandas 库找到了解决这个问题的办法。代码如下：

import pandas as pd

# Load the two input files into DataFrames.
df = pd.read_csv('file1.csv')
df2 = pd.read_csv('file2.csv')

# Normalise the header names of both frames to lowercase.
for frame in (df, df2):
    frame.columns = frame.columns.str.lower()

# Column-name sets of each frame, used for the membership tests below.
df_columns = set(df.columns)
df2_columns = set(df2.columns)

识别并传输“公共”列：

# Columns that carry the same data under different names:
# name in df (file1) -> name in df2 (file2).
renamed_columns = {
    "subjectprefix": "prefix_name",
    "subjectfirstname": "first_name",
    "subjectlastname": "last_name",
}

# df3 collects the names of the df2 columns that received data from df.
# It is created before the loop so that every match can append to it,
# whatever order the columns are visited in.
df3 = []
for col in df.columns:
    # A df column feeds its renamed counterpart (if any) and the df2 column
    # with exactly the same name (if any).
    for col2 in (renamed_columns.get(col), col):
        if col2 in df2_columns:
            # copy the data from df[col] to df2[col2]
            df2[col2] = df[col]
            df3.append(col2)

从数据帧df2中删除“不常见”列：

# Drop every df2 column that did not receive data from df (its name is not
# listed in df3).
for col2 in df2_columns:
    if col2 not in df3:
        del df2[col2]

# print the output
# Use the "id" column as the row index; the column must still be present in
# df2 at this point.
df2.set_index("id", inplace=True)
# The function-call form of print works on Python 3 as well as Python 2.
print(df2)

将输出另存为.csv文件：

# Write df2 to output.csv; to_csv includes the index by default, so the "id"
# index set earlier in this script is written as the first column.
df2.to_csv('output.csv')

我相信这不是一个最佳的解决方案，我希望代码可以在识别和传输“公共”列方面得到改进。我的代码中充满了 if/elif 语句，我确信一定有更好的实现方法。

相关问题更多 >

编程相关推荐

热门问题

热门文章