组合多个csv的有效方法

1条回答

网友

1楼 · 发布于 2024-10-03 21:35:15

可能的改进

方法1：使用熊猫

#df_t1 = pd.DataFrame()  
df_t1_lst = []
for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
 
    if thisCSV.endswith('type1.csv'):
        df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False,low_memory=False)
        #df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])  unnecessary to loop use next line instead
        df["File Name"] = thisCSV             # places thisCSV in every row
        #df_t1 = pd.concat([df_t1,df], axis=0, ignore_index=True)          # concat slow, append to list instead
        df_t1_lst.append(df)
        
df_t1 = pd.concat(df_t1_lst, ignore_index=True)       # Form dataframe from list (faster than pd.concat in loop)
df_t1.to_csv(outpath + "df_t1.csv", index = None, header=True, encoding='utf-8') 
print("df_t1.csv generated")

方法1a

使用熊猫连续附加到CSV输出文件

import os
import pandas as pd

def str_to_bytes(s):
    ' String to byte array '
    result = bytearray()
    result.extend(map(ord, s))
    return result


def good_file(file_path):
    """ Check if file exists and is not empty"""
    # Check if file exist and it is empty
    return os.path.exists(file_path) and os.stat(file_path).st_size > 0
    
SEPARATOR = ','   # Separator used by CSV file
write_header = True

pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]

pathxl = r"C:\\Users\\darryl\\OneDrive\\Python"
outpath = pathxl + r"\\"
excelNames = ["test1_type1.csv", "test2_type1.csv"]

output_file = outpath + "df_t1.csv"
with open(output_file, "w") as ofile:
    pass   # create empty output file

for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()

    input_file = pathxl + "\\" + thisCSV
    if thisCSV.endswith('type1.csv') and good_file(input_file):
        df = pd.read_csv(input_file)
        if df.shape[0] > 0:
            df['File Name'] = thisCSV                # Add filename
            df = df.sort_index(axis = 1)             # sort based upon colunn in ascending order

            # Append to output file
            df.to_csv(output_file, mode='a', 
                      index = False, 
                      header= write_header)

            write_header = False                     # Only write header once
            
        del df

方法2：二进制文件

读取/写入二进制文件和使用内存映射应该更快

from tqdm import tqdm
import os
import mmap

def str_to_bytes(s):
    ' String to byte array '
    result = bytearray()
    result.extend(map(ord, s))
    return result

def good_file(file_path):
    """ Check if file exists and is not empty"""
    # Check if file exist and it is empty
    return os.path.exists(file_path) and os.stat(file_path).st_size > 0
    
SEPARATOR = ','   # Separator used by CSV file
header = None

pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]

with open(outpath + "df_t1.csv", "wb") as ofile:
    for i in tqdm(range(len(excelNames))):
        thisCSV = str(excelNames[i]).lower().strip()

        input_file = pathxl + "\\" + thisCSV
        if thisCSV.endswith('type1.csv') and good_file(input_file):
            with open(input_file, "rb") as ifile:
                print('file ', thisCSV)
                # memory-map the file, size 0 means whole file
                with mmap.mmap(ifile.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                    text_iter = iter(mmap_obj.read().split(b'\n'))

                if header is None:
                    header = next(text_iter)
                    header = header.rstrip() +  str_to_bytes(SEPARATOR + "File Name\n")
                    ofile.write(header)  # write header
                else:
                    next(text_iter)        # ignore header row

                # write data to output file
                file_value =  str_to_bytes(SEPARATOR + f"{thisCSV}\n")
                for line in text_iter:
                    if line.strip():             # skip blank lines
                        ofile.write(line.rstrip() + file_value)

相关问题更多 >

编程相关推荐

热门问题

热门文章

组合多个csv的有效方法

相关问题 更多 >

编程相关推荐

热门问题

热门文章

相关问题更多 >