Python：遍历CSV的每一行，统计每一行中的词元（token）数量，并创建一个新的CSV，其中包含原始CSV每一行的词元数

import csv
from textblob_de import TextBlobDE as TextBlob  # kept from the original script; the whitespace-based count below does not use it


def numOfWordTokens(document):
    """Return the number of whitespace-separated tokens in each row.

    Args:
        document: Iterable of strings, one per input row (for example the
            list returned by ``readlines()``).

    Returns:
        A list of ints holding one token count per row, in input order.
    """
    return [len(eachRow.split()) for eachRow in document]


# Read all rows up front; the context manager closes the file even on error.
with open('myInputFile.csv', encoding="utf-8") as input_file:
    data = input_file.readlines()

wordTokens = numOfWordTokens(data)

# newline='' lets the csv module control the line endings itself.
with open('myOutputFile.csv', 'w', encoding="utf-8", newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    # Define the headers of the CSV
    csv_writer.writerow(['Text-ID', 'Tokens'])
    # Write one output row per input row.
    # NOTE(review): the input schema is not visible here, so the 1-based row
    # number is used as Text-ID, and any header line in the input is counted
    # like a normal row -- confirm against the real file.
    for text_id, token_count in enumerate(wordTokens, start=1):
        csv_writer.writerow([text_id, token_count])

1条回答

网友

1楼 · 发布于 2024-09-28 22:25:18

使用pandas可以很简单地实现；如果您不想引入额外的模块，也可以只用标准库的csv模块手动逐行处理 :) 下面分别给出了pandas版本和手动遍历数据版本的代码：

import pandas as pd
import csv


def main_pandas(path_to_csv: str, target_path: str):
    """Write a CSV with the token count of every row of the input CSV.

    The input is read with pandas and must contain the columns ``ID`` and
    ``Content``. The output contains the columns ``ID`` and ``tokens``,
    where ``tokens`` is the number of whitespace-separated tokens in
    ``Content``.

    Args:
        path_to_csv: Path of the UTF-8 encoded input CSV.
        target_path: Path of the CSV file to create.

    Raises:
        KeyError: If the ``ID`` or ``Content`` column is missing.
    """
    def count_tokens(value) -> int:
        # pandas reads empty cells as NaN (a float), which has no .split();
        # treat missing text as zero tokens instead of crashing.
        if pd.isna(value):
            return 0
        # str() also covers cells that pandas parsed as numbers.
        return len(str(value).split())

    df = pd.read_csv(path_to_csv, encoding='utf-8')
    df['tokens'] = df['Content'].map(count_tokens)
    df[['ID', 'tokens']].to_csv(target_path, index=False)


def main_manual(path_to_csv: str, target_path: str):
    """Write a CSV with the token count of every row, using only ``csv``.

    The first row of the input is treated as a header and skipped. Every
    following non-blank row must have exactly two columns (text id, text
    content). The output has the header ``Text ID, tokens`` followed by one
    row per input row, where ``tokens`` is the number of
    whitespace-separated tokens in the text content.

    Args:
        path_to_csv: Path of the UTF-8 encoded input CSV.
        target_path: Path of the CSV file to create.

    Raises:
        ValueError: If a non-blank data row does not have exactly two columns.
    """
    # Explicit UTF-8 keeps the result independent of the platform default;
    # newline='' is what the csv module expects so it can handle line
    # endings (and newlines embedded in quoted fields) itself.
    with open(path_to_csv, 'r', encoding='utf-8', newline='') as r_fp, \
            open(target_path, 'w', encoding='utf-8', newline='') as w_fp:
        csv_reader = csv.reader(r_fp)
        csv_writer = csv.writer(w_fp)
        # Skip headers; the default avoids StopIteration on an empty file.
        next(csv_reader, None)
        csv_writer.writerow(['Text ID', 'tokens'])  # Write headers
        for line in csv_reader:
            if not line:
                # csv.reader yields [] for blank lines; nothing to count.
                continue
            text_id, text_content = line
            csv_writer.writerow([text_id, len(text_content.split())])


if __name__ == '__main__':
    # Script entry point: run the standard-library variant on the sample
    # input file and write the per-row token counts next to it.
    main_manual(path_to_csv='text.csv', target_path='tokens.csv')

相关问题更多 >

编程相关推荐

热门问题

热门文章