代码不能在大数据上运行

2024-09-29 23:23:28 发布

您现在位置:Python中文网/ 问答频道 /正文

这是在我的系统上运行的代码,代码运行良好,数据量很小,但同样的代码不能处理大量的数据集或文本文件,是的@ennikiller是对的,一些文件是空的(意味着它们没有Esgn值,但它们有其他值),我想按如下方式处理它,它应该给Esgn赋值为零,或者只计算所有其他值的最小值,并留下Esgn(在不包含Esgn的特定文件中,不是所有的)

    import os.path
    import glob
    import re
    import itertools
    from collections import namedtuple, deque
    from operator import attrgetter

    R_PREFIX_VALUE = re.compile(r'^(?P<prefix>[A-Z]+)(?P<suffix>\d+)\s+(?P<value>\d+)\s*$')

    getvalue  = attrgetter('value')

    def interleave(seq, val):
        return itertools.chain.from_iterable(itertools.izip(seq, itertools.repeat(val)))

    class Fileline(namedtuple('Fileline', 'filename prefix suffix value')):
        @classmethod
        def _fromstr(cls, s, filename=None, rematch=R_PREFIX_VALUE.match):
            m = rematch(s)
            if not m:
                raise ValueError('No valid line found in %r' % s)
            d = m.groupdict()
            d['value'] = int(d['value'])
            d['filename'] = filename
            return cls(**d)

        def _asstr(self):
            return '{}{} {}'.format(self.prefix, self.suffix, self.value)

    def max_value_with_prefix(lineseq, prefix, getvalue=getvalue):
        withprefix = (line for line in lineseq if line.prefix==prefix)
        return max_value(withprefix)

    def filter_lt_line(lineseq, maxline):
        for line in lineseq:
            if line.prefix != maxline.prefix or line.value >= maxline.value:
                yield line

    def extreme_value(fn, lineseq, getvalue=getvalue):
        try:
            return fn((l for l in lineseq if l is not None), key=getvalue)
        except ValueError:
            return None

    def max_value(lineseq):
        return extreme_value(max, lineseq)

    def min_value(lineseq):
        return extreme_value(min, lineseq)

    def read_lines(fn, maker=Fileline._fromstr):
        with open(fn, 'rb') as f:
            return deque(maker(l, fn) for l in f)

    def write_file(fn, lineseq):
        lines = (l._asstr() for l in lineseq)
        newlines = interleave(lines, '\n')
        with open(fn, 'wb') as f:
            f.writelines(newlines)

    def write_output_file(fn, lineseq):
        lines = ("{} {}".format(l.filename, l.value) for l in lineseq)
        newlines = interleave(lines, "\n")
        with open(fn, 'wb') as f:
            f.writelines(newlines)

    def filter_max_returning_min(fn, prefix):
        lineseq = read_lines(fn)
        maxvalue = max_value_with_prefix(lineseq, prefix)
        filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
        write_file(fn, filteredlineseq)
        minline = min_value(filteredlineseq)
        return minline

    def main(fileglob, prefix, outputfile):
        minline = None
        for fn in glob.iglob(fileglob):
            fileminline = filter_max_returning_min(fn, prefix)
            minline = min_value([minline, fileminline])
        write_output_file(outputfile, [minline])
    def _worker(args):
        return filter_max_returning_min(*args)

    """def multi_main(fileglob, prefix, outputfile, processes):
        from multiprocessing import Pool
        pool = Pool(processes=processes)
        workerargs = ((fn, prefix) for fn in glob.iglob(fileglob))
        minlines = pool.imap_unordered(_worker, workerargs, processes)
        minline = min_value(minlines)
        write_file(outputfile, [minline])"""
    def main(fileglob, prefix, outputfile):
        minlines = []
        for fn in glob.iglob(fileglob):
            minlines.append(filter_max_returning_min(fn, prefix))
        write_output_file(outputfile, minlines)
    main('C:\Python27\DataSet\*.txt', 'ENSG', 'output.txt') 

这是我的代码,这个代码在10/20文本文件上运行得很好,但是当我运行3000个txt文件时,它会给我以下错误

    Traceback (most recent call last):
      File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 95, in <module>
        main('C:\Python27\DataSet\*.txt', 'ENSG', 'output.txt')
      File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 93, in main
        minlines.append(filter_max_returning_min(fn, prefix))
      File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 69, in filter_max_returning_min
        filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
      File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 35, in filter_lt_line
        if line.prefix != maxline.prefix or line.value >= maxline.value:
    AttributeError: 'NoneType' object has no attribute 'prefix'
lines = ("{} {}".format(l.filename, l.value) for l in lineseq)
AttributeError: 'NoneType' object has no attribute 'filename'

Tags: inimportforprefixreturnvaluedefline
1条回答
网友
1楼 · 发布于 2024-09-29 23:23:28

尝试打印line和maxline的值

很明显,其中一个被设置为“无”。很可能是麦克斯林,如果事实确实如此的话。所以,找出你为什么不通过maxline。所以你应该看看这里:

try:
    lineseq = read_lines(fn)
    maxvalue = max_value_with_prefix(lineseq, prefix)
    filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
    write_file(fn, filteredlineseq)
    minline = min_value(filteredlineseq)
except:
    print "%s is apparently empty" % fn
    minline = None
finally:
    return milline

有没有可能fn是一个空文件?你知道吗

正如Joran所说,您只需要添加一些异常处理(我在上面添加了它)

相关问题 更多 >

    热门问题