Python: IOError 110 Connection timed out when reading from disk

from datasketch import MinHash
from multiprocessing import Pool
from collections import defaultdict
from nltk import ngrams
import errno
import json
import sys
import time
import codecs
import config

cores = 24
window_len = 12
step = 4
worker_files = 50
permutations = 256
hashband_len = 4
# Retry policy for reads that fail with errno 110 (ETIMEDOUT): how many
# attempts to make in total, and how many seconds to sleep between them.
read_retries = 60
read_retry_delay = 1


def _read_text(path):
    '''Return the utf8-decoded contents of `path`, retrying on timeouts.

    A read that fails with errno ETIMEDOUT ("Connection timed out") is
    retried after sleeping `read_retry_delay` seconds, for at most
    `read_retries` attempts in total. Any other IOError, or a timeout on
    the final attempt, is re-raised to the caller.
    '''
    attempt = 0
    while True:
        try:
            with codecs.open(path, 'r', 'utf8') as f:
                return f.read()
        except IOError as exc:
            attempt += 1
            if exc.errno != errno.ETIMEDOUT or attempt >= read_retries:
                raise
            time.sleep(read_retry_delay)


def _band_hashvalues(hashvalues):
    '''Group `hashvalues` into '.'-joined strings of `hashband_len` values.

    Values are consumed in order; a trailing group shorter than
    `hashband_len` is dropped.
    '''
    bands = []
    band = []
    for value in hashvalues:
        band.append(str(value))
        if len(band) == hashband_len:
            bands.append('.'.join(band))
            band = []
    return bands


def minhash_text(args):
    '''Return a list of hashband strings for an input doc.

    Args:
        args: a `(file_id, path)` pair; `path` is read as utf8 text.

    Returns:
        A dict `{'file_id': file_id, 'hashbands': [...]}` where `hashbands`
        holds one list of hashband strings per sampled window (every
        `step`-th window of `window_len` whitespace-separated words). If
        anything fails while reading or hashing, the error is printed and
        `hashbands` is an empty list.
    '''
    # Unpack outside the try block so the error handler below can always
    # refer to file_id.
    file_id, path = args
    try:
        text = _read_text(path)
        all_hashbands = []
        for window_idx, window in enumerate(ngrams(text.split(), window_len)):
            # Only every `step`-th window is hashed.
            if window_idx % step != 0:
                continue
            minhash = MinHash(num_perm=permutations, seed=1)
            # Feed the set of distinct character trigrams of the window.
            for ngram in set(ngrams(' '.join(window), 3)):
                minhash.update(''.join(ngram).encode('utf8'))
            all_hashbands.append(_band_hashvalues(minhash.hashvalues))
        return {'file_id': file_id, 'hashbands': all_hashbands}
    except Exception as exc:
        # Best effort: report the failure and let the rest of the batch run.
        print(' ! error occurred while processing', file_id, exc)
        return {'file_id': file_id, 'hashbands': []}


def main():
    '''Hash this worker's share of files and write the hashband index.

    Reads the `{file_id: path}` mapping from file_ids.json, takes the worker
    index from `sys.argv[1]`, hashes the selected files in a process pool and
    dumps a `{hashband: ['<file_id>.<window index>', ...]}` mapping as JSON
    into `config.out_dir`.
    '''
    with open('file_ids.json') as f:
        file_ids = json.load(f)
    file_id_path_tuples = list(file_ids.items())
    worker_id = int(sys.argv[1])
    # NOTE(review): ngrams() yields overlapping sliding windows, so worker k
    # receives items k .. k + worker_files - 1 and adjacent workers share
    # most of their files. Kept as-is; confirm whether disjoint chunks were
    # intended.
    worker_ids = list(ngrams(file_id_path_tuples, worker_files))[worker_id]
    hashband_to_ids = defaultdict(list)
    pool = Pool(cores)
    try:
        for idx, result in enumerate(pool.imap(minhash_text, worker_ids)):
            print(' * processed', idx, 'results')
            file_id = result['file_id']
            hashbands = result['hashbands']
            for window_idx, window_hashbands in enumerate(hashbands):
                for hashband in window_hashbands:
                    hashband_to_ids[hashband].append(
                        file_id + '.' + str(window_idx))
    finally:
        # Every result has been consumed by now (or an error occurred), so
        # stop the workers instead of leaving them running.
        pool.terminate()
        pool.join()
    out_path = config.out_dir + 'minhashes-' + str(worker_id) + '.json'
    with open(out_path, 'w') as out:
        json.dump(dict(hashband_to_ids), out)


if __name__ == '__main__':
    main()

1条回答

网友

1楼 · 发布于 2024-09-21 05:44:00

结果发现我对文件系统的冲击太大了，对同一台服务器上的文件进行了太多的并发读取请求。该服务器在给定的时间段内只能允许固定数量的读取，因此任何超过该限制的请求都会收到连接超时响应。

解决方案是在while循环中包装每个文件读取请求。在while循环中，尝试从磁盘读取适当的文件。如果出现连接超时错误，请睡眠一秒钟，然后重试。只有当文件被读取后，while循环才能被中断。

相关问题更多 >

编程相关推荐

热门问题

热门文章