在Python中读取缓冲区中块的最节省内存的方法

449319.34;6242700.23;0.38;1;1;1;0;0;42;25;3;17;482375.326087;20224;23808;23808 449310.72;6242700.22;0.35;3;1;1;0;0;42;23;3;17;482375.334291;20480;24576;24576 449313.81;6242700.66;0.39;1;1;1;0;0;42;24;3;17;482375.342666;20224;24576;24576 449298.37;6242700.27;0.39;1;1;1;0;0;42;21;3;17;482375.350762;18176;22784;23552 449287.47;6242700.06;0.39;11;1;1;0;0;42;20;3;17;482375.358921;20736;24832;24832 449290.11;6242700.21;0.35;1;1;1;0;0;42;20;3;17;482375.358962;19968;24064;23808 449280.48;6242700.08;0.33;1;1;1;0;0;42;18;3;17;482375.367142;22528;25856;26624 449286.97;6242700.44;0.36;3;1;1;0;0;42;19;3;17;482375.367246;19712;23552;23296 449293.03;6242700.78;0.37;1;1;1;0;0;42;21;3;17;482375.367342;19456;23296;23808 449313.36;6242701.92;0.38;6;1;1;0;0;42;24;3;17;482375.367654;19968;24576;24576 449277.48;6242700.17;0.34;8;1;1;0;0;42;18;3;17;482375.375420;20224;23808;25088 449289.46;6242700.85;0.31;3;1;1;0;0;42;20;3;17;482375.375611;18944;23040;23040

from __future__ import division

import os
import glob
import tempfile
import sys


def print_flulsh(n, maxvalue=None):
    """Rewrite the current console line with a progress message.

    The (misspelled) name is kept as-is so existing callers keep working.

    n        -- number of laser points processed so far
    maxvalue -- total number of points, or None when the total is unknown
    """
    # "\r" returns the cursor to the start of the line so that each call
    # overwrites the previous message instead of scrolling.
    sys.stdout.write("\r")
    if maxvalue is None:
        sys.stdout.write("Laser points processed: %d" % n)
    else:
        sys.stdout.write("%d of %d laser points processed" % (n, maxvalue))
    sys.stdout.flush()


def point_grid_id(x, y, minx, maxy, size):
    """Return the (row, col) grid cell that contains the point (x, y).

    The grid origin (minx, maxy) is its upper-left corner and every cell is
    a square with side `size`.  Columns grow with x, rows grow as y
    decreases.

    NOTE: int() truncates toward zero, so a point less than one cell outside
    the left or top edge is still reported as column 0 / row 0.
    """
    col = int((x - minx) / size)
    row = int((maxy - y) / size)
    return row, col


def tempfile_tile_name(line, temp_dir, minx, maxy, size, parse):
    """Return the path of the temporary tile file a text line belongs to.

    The first two `parse`-delimited fields of `line` are read as the x and y
    coordinates; the file is named after the grid cell they fall into.
    """
    # Only the first two fields are needed, so stop splitting after them.
    x, y = line.split(parse, 2)[:2]
    row, col = point_grid_id(float(x), float(y), minx, maxy, size)
    return os.path.normpath(
        os.path.join(temp_dir + os.sep, "tempfile_%s_%s.tmp" % (row, col)))


def tempfile_split(filename, temp_dir, minx, maxy, size, parse, num):
    """Split a text file into one small text file per grid cell.

    Every line is appended to the file named by tempfile_tile_name().

    filename   -- name + path of the input text file
    temp_dir   -- temporary folder that receives the tile files
    minx, maxy -- origin of the grid (upper-left corner)
    size       -- size of a grid cell
    parse      -- delimiter of the text file
    num        -- number of lines in the input, used for progress output only

    The input is read in chunks of roughly 100000 bytes.  Within a chunk the
    lines are grouped by destination so that each tile file is opened and
    closed once per chunk instead of once per line; the relative order of
    the lines inside every tile file is unchanged.  If a malformed line
    raises, the lines of the chunk being grouped have not been written yet.
    """
    index = 1
    with open(filename) as infile:
        while True:
            lines = infile.readlines(100000)
            if not lines:
                break
            # tile file path -> lines of this chunk destined for it
            pending = {}
            for line in lines:
                print_flulsh(index, num)
                index += 1
                name = tempfile_tile_name(line, temp_dir, minx, maxy,
                                          size, parse)
                pending.setdefault(name, []).append(line)
            for name, tile_lines in pending.items():
                with open(name, 'a') as outfile:
                    outfile.writelines(tile_lines)

1条回答

网友

1楼 · 发布于 2024-05-06 08:06:10

代码中的瓶颈不在于读取，而在于每读一行就打开和关闭一个输出文件。在评论中你提到了你的最终目标：分割后，我需要再次打开每个文件，并随机选择一行。

theodox提到了一种可能的方法：为每个ID保留遇到的第一个条目，然后在内存中随机地用后续条目覆盖它。请注意，覆盖必须以1/n的概率发生，其中n是到目前为止看到的具有相同ID的行数，这样才能避免结果偏向靠后的样本。

编辑：您可以通过对文件进行两次遍历来节省内存。第一次遍历生成一组被随机选择排除的行号，第二次遍历处理未被排除的行。

from random import random

def random_selection(filename, temp_dir, minx, maxy, size, parse, num,
                     process=None):
    """Pick one uniformly random line per grid cell and process it.

    The file is read twice so that no line text has to be kept in memory:

    1. The first pass keeps, for every grid cell, how many lines fell into
       it so far and the index of the currently chosen line.  The n-th line
       of a cell replaces the current choice with probability 1/n, which
       gives every line of the cell the same chance of being the final one.
    2. The second pass hands each finally chosen line to `process`.

    filename   -- name + path of the input text file
    temp_dir   -- unused; kept for signature compatibility
    minx, maxy -- origin of the grid (upper-left corner)
    size       -- size of a grid cell
    parse      -- delimiter of the text file
    num        -- unused; kept for signature compatibility
    process    -- optional callable invoked with each selected line; when
                  omitted the selected lines are simply skipped over

    Relies on point_grid_id() from the same module for the cell lookup.
    """
    # grid cell -> (lines seen in that cell, index of the chosen line)
    selection = {}
    with open(filename) as infile:
        for i, line in enumerate(infile):
            # Only x and y are needed; tolerate lines with just two fields.
            x, y = line.split(parse, 2)[:2]
            row_col = point_grid_id(float(x), float(y), minx, maxy, size)
            try:
                n, selected_i = selection[row_col]
            except KeyError:
                selection[row_col] = 1, i
            else:
                n += 1
                if random() < 1.0 / n:
                    selected_i = i
                selection[row_col] = n, selected_i

    # Track the winners rather than the losers: a line that never wins a
    # draw must not be processed either, and one index per cell is far
    # smaller than a set holding every rejected line number.
    chosen = set(selected_i for _, selected_i in selection.values())
    del selection

    with open(filename) as infile:
        for i, line in enumerate(infile):
            if i in chosen and process is not None:
                # process the line
                process(line)

相关问题更多 >

编程相关推荐

热门问题

热门文章