为什么我的文件中会丢失字段？

#!usr/bin/env python # -*- coding: utf-8 -*- f = open('datos_terr.csv', 'rb') fout = open('salida.csv', 'w') lines = f.readlines() first = lines[0].strip("\r\n") fout.write(lines[0] + "\n") for line in lines[1:]: """ Removing tab characters, used to separate the values. Then I insert NULL values between them for uknown fields. I add "" characters to strings to make WEKA able to accept them, and I put the separation value. I remove the ending tabs and they are subtituted by commas. I write the line to the output file and close both. Elimino los caracteres de tabulación, que son los que representan la separación. Luego los separo por ellos tras añadir el NULL para los campos de los que no conozco los datos. Añado comillas a las cadenas de texto para que WEKA las acepte y añado el caracter de separación. Elimino las tabulaciones que me sobren al final y luego los sustituyo por comas. Las escribo al fichero de salida y cierro ambos. """ line = line.strip("\r\n") line = line.replace("'", "") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t\t", "\tNULL\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t\t", "\tNULL\tNULL\tNULL\t") line = line.replace("\t\t\t", "\tNULL\tNULL\t") line = line.replace("\t\t", "\tNULL\t") new_line = "" data = line.split("\t") for word in data: word = word.strip(" ") word = word.replace(" ", "") if word.isspace(): word = "NULL" if "," in word: new_line += '"' + word + '"' else: if not word.isdigit() and not word == "NULL" and not isinstance(word, float) and not word == "": new_line += '"' + word + '"\t' else: new_line += word + "\t" new_line = new_line.strip('\t') new_line = new_line.replace("\t", ",") fout.write(new_line + "\n") f.close() fout.close()

1条回答

网友

1楼 · 发布于 2024-06-28 10:57:40

我将使用csv模块获得字段列表，并处理这些字段。更干净的代码通常更容易发现错误。不使用csv模块也可以做同样的事情，但是该模块已经可以说几种不同的格式了——例如，它将自动引用包含分隔符的字段，因此您不需要进行if "," in word:检查。您还可以通过一个简单的选项来检查文档，看看是否有其他检查为您处理：https://docs.python.org/2/library/csv.html

您的代码为每行创建了一个新字符串，因此我刚刚为每行创建了一个新列表，作为编写代码的等效方法：

with open('datos_terr.csv', 'rb') as incsv, open('salida.csv', 'wb') as outcsv:
    # Read from the first, saying that tab is the field delimiter
    myreader = csv.reader(incsv, delimiter='\t')
    # , is the default, here for explanation
    mywriter = csv.writer(outcsv, delimiter=',')
    for row in myreader:
        # row is a list of the fields.
        newrow = list()
        for field in row:
            # No spaces allowed in fields
            field = field.strip()
            field = field.replace(' ', '')
            # single quotes to be removed, as per original code
            field = field.replace("'", '')
            if len(field) < 1:
                field = 'NULL'
            newrow.append(field)
        mywriter.writerow(newrow)
        # print ', '.join(newrow)

相关问题更多 >

编程相关推荐

热门问题

热门文章