将文本文件拆分为较小的文件

"GROUP";"DetailA1";"DetailA2";"DetailA3";"DetailA4" "PRINT";"1" "BodyA1";"BodyA2";"BodyA3" "BodyB1";"BodyB2";"BodyB3" "BodyC1";"BodyC2";"BodyC3" "BodyD1";"BodyD2";"END" "GROUP";"DetailB1";"DetailB2";"DetailB3";"DetailB4" "PRINT";"1" "BodyA1";"BodyA2";"BodyA3" "BodyB1";"BodyB2";"BodyB3" "BodyC1";"BodyC2";"BodyC3" "BodyD1";"BodyD2";"END" "GROUP";"DetailC1";"DetailC2";"DetailC3";"DetailC4" "PRINT";"1" "BodyA1";"BodyA2";"BodyA3" "BodyB1";"BodyB2";"BodyB3" "BodyC1";"BodyC2";"BodyC3" "BodyD1";"BodyD2";"END" "GROUP";"DetailD1";"DetailD2";"DetailD3";"DetailD4" "PRINT";"1" "BodyA1";"BodyA2";"BodyA3" "BodyB1";"BodyB2";"BodyB3" "BodyC1";"BodyC2";"BodyC3" "BodyD1";"BodyD2";"END" "GROUP";"DetailE1";"DetailE2";"DetailE3";"DetailE4" "PRINT";"1" "BodyA1";"BodyA2";"BodyA3" "BodyB1";"BodyB2";"BodyB3" "BodyC1";"BodyC2";"BodyC3" "BodyD1";"BodyD2";"END" "GROUP";"DetailF1";"DetailF2";"DetailF3";"DetailF4" "PRINT";"1" "BodyA1";"BodyA2";"BodyA3" "BodyB1";"BodyB2";"BodyB3" "BodyC1";"BodyC2";"BodyC3" "BodyD1";"BodyD2";"END"

import re import math # Path to file input_text = "input.txt" # Empty list containing parsed lists of text parsed = [] # Empty list containing single parsed list lastblock = [] # define the "beginning of new block" pattern newblockregex = re.compile('^"GROUP.*') # Create blocks of text as list of lists with open(input_text) as textfile: for line in textfile.readlines(): if newblockregex.match(line.rstrip('\n')): if lastblock: parsed.append(lastblock) lastblock = [] lastblock = [line.rstrip('\n')] else: lastblock.append(line.rstrip('\n')) parsed.append(lastblock) ''' End of blocking of text''' # Get total number of lines sumlen = sum([len(rec) for leng, rec in enumerate(parsed)]) print(f"Total rows of record: {sumlen}") # Function to calculate number of resulting files def maxPrimeFactors (n): # Initialize the maximum prime factor # variable with the lowest one maxPrime = -1 # Print the number of 2s that divide n while n % 2 == 0: maxPrime = 2 n >>= 1 # equivalent to n /= 2 # n must be odd at this point, # thus skip the even numbers and # iterate only for odd integers for i in range(3, int(math.sqrt(n)) + 1, 2): while n % i == 0: maxPrime = i n = n / i # This condition is to handle the # case when n is a prime number # greater than 2 if n > 2: maxPrime = n return int(maxPrime) # Count of blocks (forms/records) in the file # Gets number of blocks in the raw file formnum = len(parsed) # Number of resulting files splitsnum = maxPrimeFactors(formnum) blocksPerFile = round(formnum/splitsnum) print(f"There are {formnum} forms.") # Prints to user the number of blocks print(f"Number of forms per output file: {blocksPerFile}") # Split records into new file def slice_per(parsed, blocksPerFile): for record in parsed: counter = 1 with open(f'small_file_{counter + blocksPerFile}.txt', 'w+') as output: for L in record: output.write(L) counter+=1 print(f"File number {parsed.index(record)}") output.close() print ("Done!") slice_per(parsed, blocksPerFile)

1条回答

网友

1楼 · 发布于 2024-09-30 05:26:24

change in slice method counter should be outside of loop

def slice_per(parsed, blocksPerFile):
    counter = 1
    for record in parsed:
        with open('small_file_%s.txt'%(counter + blocksPerFile), 'w') as output:
            for L in record:
                output.write(L)
        counter+=1
        print("File number %s"%(parsed.index(record)))
    output.close()


    print ("Done!")

相关问题更多 >

编程相关推荐

热门问题

热门文章