如何从多个块一次读取两个连续块的数据，直到文件结束？

chr pos A_block A_val 2 05 7 A,T,C 2 11 7 T,C,G 2 15 7 AT,C,G 2 21 7 C,A,GT 2 31 7 T,C,CA 2 42 9 T,C,G 2 55 9 C,G,GC 2 61 9 A,GC,T 2 05 12 AC,TG,G 2 11 12 A,TC,TG

import req_packages from collections import defaultdict ''' make a function that takes data from two blocks at a time ''' def parse_two_blocks(someData): for key, vals in someData: do ... something write the obtained output clear memory # to prevent memory buildup ''' Now, read the input file''' with open('HaploBlock_toy.txt') as HaploBlocks: header = HaploBlocks.readline() # only reads the first line as header ''' create a empty dict or default dict. Which ever is better?''' Hap_Dict = {} Hap_Dict = defaultdict(list) ''' for rest of the lines ''' for lines in HaploBlocks: values = lines.strip('\n').split('\t') ''' append the data to the dict for unique keys on the for loop, until the number of unique keys is 2 ''' Block = values[2] Hap_Dict[Block].append(values[3]) do something to count the number of keys - how? if keys_count > 2: return parse_two_blocks(Hap_Dict) elif keys_count < 2 or no new keys: # This one is odd and won't work I know. end the program

1条回答

网友

1楼 · 发布于 2024-06-02 11:33:42

将输入解析为一个惰性的块序列（生成器），然后成对地迭代相邻的块。所有处理都是在逐对求值时按需完成的；也就是说，下面的代码不会一次性读取或存储整个 CSV 文件。

#!/usr/bin/env python3

# Tab-separated sample input: one header row followed by the rows of blocks
# 7, 9 and 12. The separators are written as explicit "\t" escapes so that
# editors or copy/paste cannot silently turn them into spaces, which would
# make csv.excel_tab see a single column per row.
data = (
    "chr\tpos\tA_block\tA_val\n"
    "2\t05\t7\tA,T,C\n"
    "2\t11\t7\tT,C,G\n"
    "2\t15\t7\tAT,C,G\n"
    "2\t21\t7\tC,A,GT\n"
    "2\t31\t7\tT,C,CA\n"
    "2\t42\t9\tT,C,G\n"
    "2\t55\t9\tC,G,GC\n"
    "2\t61\t9\tA,GC,T\n"
    "2\t05\t12\tAC,TG,G\n"
    "2\t11\t12\tA,TC,TG"
)

import csv
import io
import itertools
import collections
import operator
from pprint import pprint

def pairwise(iterable):
    """Return an iterator over overlapping consecutive pairs of *iterable*.

    s -> (s0, s1), (s1, s2), (s2, s3), ...

    An input with fewer than two items produces no pairs.
    """
    current, following = itertools.tee(iterable)
    # Advance the second copy by one item so that zip() lines every element
    # up with its successor; an empty input is simply left as is.
    try:
        next(following)
    except StopIteration:
        pass
    return zip(current, following)

def one():
    """Yield consecutive pairs of blocks, each block as a column-wise dict.

    The module-level ``data`` string is parsed as tab-delimited CSV. Rows
    are grouped by runs of equal ``A_block`` values, and every group is
    turned into a dict that maps each header name to a tuple holding that
    column's values for the group. Neighbouring groups are then paired up
    with ``pairwise``.
    """
    reader = csv.reader(io.StringIO(data), dialect=csv.excel_tab)
    # The first row holds the column names.
    header = next(reader)
    by_block = operator.itemgetter(header.index('A_block'))

    def as_columns(rows):
        # Transpose the group's rows into columns and label each column
        # with its header name.
        return dict(zip(header, zip(*tuple(rows))))

    # Lazily build one dict per run of rows sharing the same block value.
    column_dicts = (
        as_columns(rows)
        for _, rows in itertools.groupby(reader, key=by_block)
    )
    yield from pairwise(column_dicts)


def two():
    """Yield consecutive pairs of ``(block_key, rows)`` tuples.

    The module-level ``data`` string is read with ``csv.DictReader`` as
    tab-delimited CSV, so every row is a mapping keyed by the header names.
    Rows are grouped by runs of equal ``'A_block'`` values and neighbouring
    groups are paired up with ``pairwise``.
    """
    reader = csv.DictReader(io.StringIO(data), dialect=csv.excel_tab)
    grouped = itertools.groupby(reader, key=operator.itemgetter('A_block'))
    # A groupby sub-iterator is invalidated once the next group is requested,
    # so each group's rows are copied into a list before pairing.
    keyed_blocks = ((block_key, list(rows)) for block_key, rows in grouped)
    yield from pairwise(keyed_blocks)


# Pretty-print every consecutive pair of blocks produced by one(), with a
# blank line after each pair.
for a, b in one():
    for block in (a, b):
        pprint(block)
    print()

输出（of one）：

{'A_block': ('7', '7', '7', '7', '7'),
 'A_val': ('A,T,C', 'T,C,G', 'AT,C,G', 'C,A,GT', 'T,C,CA'),
 'chr': ('2', '2', '2', '2', '2'),
 'pos': ('05', '11', '15', '21', '31')}
{'A_block': ('9', '9', '9'),
 'A_val': ('T,C,G', 'C,G,GC', 'A,GC,T'),
 'chr': ('2', '2', '2'),
 'pos': ('42', '55', '61')}

{'A_block': ('9', '9', '9'),
 'A_val': ('T,C,G', 'C,G,GC', 'A,GC,T'),
 'chr': ('2', '2', '2'),
 'pos': ('42', '55', '61')}
{'A_block': ('12', '12'),
 'A_val': ('AC,TG,G', 'A,TC,TG'),
 'chr': ('2', '2'),
 'pos': ('05', '11')}

`io.StringIO`

Take a string and return a file-like object that contains the contents of string.

`DictReader`（来自 `csv` 模块）

Returns an ordered dict for each row where the field names taken from the very first row are used as dictionary keys for the field values.

`itertools.groupby`

Make an iterator that returns consecutive keys and groups from the iterable. The key is a function computing a key value for each element.

lambda x: x['A_block']

A temporary function that takes an input named x and returns the value for the key 'A_block'

(k, list(v)) for k, v in blocks

groupby() returns an iterator (that can only be used once) for the values. This converts that iterator to a list.

`pairwise` recipe（来自 `itertools` 文档）

"s -> (s0,s1), (s1,s2), (s2, s3), ..."

相关问题更多 >

编程相关推荐

热门问题

热门文章