如何处理1.61亿行的文件？

from collections import defaultdict


class Graph(defaultdict):
    """Weighted graph stored as a dict of dicts: ``graph[u][v] -> weight``.

    The graph is populated from a delimited text file in which every
    non-blank line holds ``node1<sep>node2<sep>weight``.
    """

    def __init__(self, input_file, sep=" ", header=False, undirect=True):
        """Load the edge list found in ``input_file``.

        Args:
            input_file: Path of the edge-list file to read.
            sep: Field separator used on every line.
            header: When true, the first line is skipped.
            undirect: When true, every edge is also stored in the reverse
                direction.

        Raises:
            IndexError: If a non-blank line has fewer than three fields.
            ValueError: If the third field cannot be parsed as a float.
        """
        super(Graph, self).__init__(dict)
        # Counts every stored direction, so an undirected edge adds 2.
        self.edges_num = 0
        with open(input_file) as f:
            if header:
                f.readline()
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no edge.
                    continue
                fields = line.split(sep)
                # Parse the weight once and reuse it for both directions.
                weight = float(fields[2])
                self[fields[0]][fields[1]] = weight
                self.edges_num += 1
                if undirect:
                    self[fields[1]][fields[0]] = weight
                    self.edges_num += 1

    def nodes(self):
        """Return a list of the nodes that have at least one outgoing edge.

        For a graph loaded with ``undirect=True`` this is every node that
        appears in the file.
        """
        return list(self)

    def number_of_nodes(self):
        """Return the number of nodes that are keys of the graph."""
        return len(self)

    def number_of_edges(self):
        """Return the number of stored ``(node1, node2)`` entries.

        This always equals ``len(self.edges())``; unlike ``edges_num`` it
        does not count a line twice when the file repeats an edge.
        """
        return sum(len(neighbours) for neighbours in self.values())

    def edges(self):
        """Return every stored edge as a list of ``(node1, node2)`` tuples."""
        return [
            (node1, node2)
            for node1, neighbours in self.items()
            for node2 in neighbours
        ]

    def edges_weight(self):
        """Return ``[node1, node2, weight]`` lists sorted by ascending weight."""
        weight_list = [
            [node1, node2, weight]
            for node1, neighbours in self.items()
            for node2, weight in neighbours.items()
        ]
        # Sort in place to avoid holding a second full copy of the list.
        weight_list.sort(key=lambda x: x[2])
        return weight_list

    def get_weight(self, node1, node2):
        """Return the weight of the edge ``node1 -> node2``.

        Raises:
            KeyError: If the edge is not present. The lookup never inserts
                ``node1`` into the graph as a side effect.
        """
        # Plain indexing on a defaultdict would create an empty entry for
        # an unknown node1; use .get() so a failed lookup leaves no trace.
        neighbours = self.get(node1)
        if neighbours is None:
            raise KeyError(node1)
        return neighbours[node2]

    def get_weights(self):
        """Return the weights of all stored edges, in ``edges()`` order."""
        return [
            weight
            for neighbours in self.values()
            for weight in neighbours.values()
        ]


if __name__ == "__main__":
    input_file = "./data/mydata.dat"
    percent = 2.0
    output_file = "./data/results"
    G = Graph(input_file)
    # int() keeps the index usable on Python 2, where round() returns a float.
    position = int(round(G.number_of_edges() * percent / 100))
    dc = G.edges_weight()[position][2]
    print("average percentage of neighbours (hard coded): {}".format(percent))
    print("Computing Rho with gaussian kernel of radius: {}".format(dc))
    nodes = G.nodes()
    # Hoisted: the node count does not change inside the loops.
    node_count = G.number_of_nodes()
    for i in range(node_count - 1):
        for j in range(i + 1, node_count):
            node_i = nodes[i]
            node_j = nodes[j]
            # NOTE(review): raises KeyError when the pair is not connected;
            # the rest of this loop is not visible in the original snippet.
            dist_ij = G.get_weight(node_i, node_j)

# Fragment of a loader body: it relies on `self`, `input_file`, `sep` and
# `undirect` from the enclosing method.
# readlines(hint) returns only about `hint` bytes' worth of lines per call,
# so it has to be called repeatedly until it returns an empty list;
# a single call would silently ignore the rest of the file.
with open(input_file, 'r') as bigfile:
    while True:
        tmp_lines = bigfile.readlines(1024 * 1024)
        if not tmp_lines:
            # Empty list means end of file.
            break
        for line in tmp_lines:
            line = line.strip().split(sep)
            self[line[0]][line[1]] = float(line[2])
            self.edges_num += 1
            if undirect:
                self[line[1]][line[0]] = float(line[2])
                self.edges_num += 1

1条回答

网友
1楼 · 发布于 2024-09-27 17:49:48

最直接的答案是：不要一次性加载整个文件。实际上甚至可以一次只处理一行。例如，假设您想对每行的第三个字段求和：
filename = 'file.dat'
# Stream the file: the generator parses one line at a time, so only the
# running total is kept in memory.
with open(filename) as f:
    # The generator must be consumed inside the `with` block, while the
    # file is still open.
    lines = (int(line.split(' ')[2]) for line in f)
    print(sum(lines))
在这里，我们没有把所有行都加载到内存中。相反，我们打开了一个文件句柄并创建了一个 Python 生成器。生成器保存的是表达式 `int(line.split(' ')[2])`，只有在取到某一行时才会对该行求值。逐行取值由 sum() 驱动：sum 每次只按需取一行，不会同时把多行加载到内存。因此，执行这行代码时，我们把生成器产出的值依次相加，并维护一个累计总和。关键在于这段代码几乎不占用内存（除了解释器本身的开销）。
也可以分步筛选数据。例如，只取出第一个或第二个字段为 0 的行，并对它们的第三个字段求和：
filename = 'file.dat'
# Stream the file and keep only rows whose first or second field is '0'.
with open(filename) as f:
    lines = (line.split(' ') for line in f)
    zeros = (line for line in lines if line[0] == '0' or line[1] == '0')
    # The fields are still strings here; convert the third one to a number
    # before summing (sum() cannot add str values).
    print(sum(float(c) for a, b, c in zeros))
这当然比将部分或全部文件加载到内存中要慢。此外，你必须考虑你想在这样的文件上迭代多少次。最好只在这些行上迭代几次，收集所需的所有计算结果。然后，您可能希望保存这些答案，因为再次迭代文件需要更多时间
在考虑将文件加载到内存中时，需要仔细检查要加载的内容以及加载方式。例如，对于内容为 `1 2 26.23` 的一行，是否真的需要加载前两个值 1 和 2？如果不需要，就丢弃它们以占用更少的内存。例如：
import numpy as np

filename = 'file.dat'
# Load only the third field of every line into a pre-sized float32 array.
with open(filename) as f:
    values = (float(line.split(' ')[2]) for line in f)
    # `count` pre-allocates the output array; it is hard-coded and must
    # not exceed the number of lines in the file.
    X = np.fromiter(values, dtype='float32', count=161991000)
通过指定 count，我们告诉 NumPy 要预先分配多少内存（而不是在每次需要更多空间时重新调整数组大小）。按这个元素个数和 float32 数据类型计算（161991000 × 4 字节），这些数据将占用约 648 MB 的内存。因此，请注意不要编写任何会复制这份数据的操作。如果您的代码产生了 5 份副本，内存很快就会被耗尽。
我认为这会让您了解如何管理内存。：-）

相关问题更多 >

编程相关推荐

热门问题

热门文章