将文本文件拆分为较小的文件

2024-09-30 05:26:24 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试将原始文本文件拆分为更小的文件,具体取决于有多少文本块或“记录”。 现在代码输出一个文本文件和最后一条记录,但我希望(在本例中)每个文件有一条记录。 可能需要一些帮助来编辑我的函数来循环包含每条记录的列表并写入一个新文件。你知道吗

你知道吗输入.txt你知道吗

"GROUP";"DetailA1";"DetailA2";"DetailA3";"DetailA4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"
"GROUP";"DetailB1";"DetailB2";"DetailB3";"DetailB4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"
"GROUP";"DetailC1";"DetailC2";"DetailC3";"DetailC4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"
"GROUP";"DetailD1";"DetailD2";"DetailD3";"DetailD4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"
"GROUP";"DetailE1";"DetailE2";"DetailE3";"DetailE4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"
"GROUP";"DetailF1";"DetailF2";"DetailF3";"DetailF4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"

你知道吗拆分.py你知道吗

import re
import math

# Path to file
input_text = "input.txt"

# Empty list containing parsed lists of text
parsed = []

# Empty list containing single parsed list
lastblock = []  

# define the "beginning of new block" pattern
newblockregex = re.compile('^"GROUP.*') 

# Create blocks of text as list of lists
with open(input_text) as textfile: 
    for line in textfile.readlines():
        if newblockregex.match(line.rstrip('\n')):
            if lastblock:
                parsed.append(lastblock)
                lastblock = []
            lastblock = [line.rstrip('\n')]
        else:
            lastblock.append(line.rstrip('\n'))
parsed.append(lastblock)
''' End of blocking of text'''

# Get total number of lines
sumlen = sum([len(rec) for leng, rec in enumerate(parsed)])
print(f"Total rows of record: {sumlen}")

# Function to calculate number of resulting files
def maxPrimeFactors (n):   
    # Initialize the maximum prime factor 
    # variable with the lowest one 
    maxPrime = -1

    # Print the number of 2s that divide n 
    while n % 2 == 0: 
        maxPrime = 2
        n >>= 1     # equivalent to n /= 2 

    # n must be odd at this point,  
    # thus skip the even numbers and  
    # iterate only for odd integers 
    for i in range(3, int(math.sqrt(n)) + 1, 2): 
        while n % i == 0: 
            maxPrime = i 
            n = n / i 

    # This condition is to handle the  
    # case when n is a prime number  
    # greater than 2 
    if n > 2: 
        maxPrime = n 

    return int(maxPrime) 

# Count of blocks (forms/records) in the file
# Gets number of blocks in the raw file
formnum = len(parsed)

# Number of resulting files
splitsnum = maxPrimeFactors(formnum)
blocksPerFile = round(formnum/splitsnum)
print(f"There are {formnum} forms.") # Prints to user the number of blocks
print(f"Number of forms per output file: {blocksPerFile}")


# Split records into new file
def slice_per(parsed, blocksPerFile):
    for record in parsed:
        counter = 1
        with open(f'small_file_{counter + blocksPerFile}.txt', 'w+') as output:
            for L in record:
                output.write(L)
        counter+=1
        print(f"File number {parsed.index(record)}")
    output.close()
    print ("Done!")

slice_per(parsed, blocksPerFile)

Tags: oftheinnumbergroupparsedprintlastblock
1条回答
网友
1楼 · 发布于 2024-09-30 05:26:24

change in slice method counter should be outside of loop

def slice_per(parsed, blocksPerFile):
    counter = 1
    for record in parsed:
        with open('small_file_%s.txt'%(counter + blocksPerFile), 'w') as output:
            for L in record:
                output.write(L)
        counter+=1
        print("File number %s"%(parsed.index(record)))
    output.close()


    print ("Done!")

相关问题 更多 >

    热门问题