Python：将两个CSV文件合并为多级JSON

3条回答

网友

1楼 · 编辑于 2024-09-28 23:32:50

分步骤进行：

1. 读取传入的 TSV 文件，并将来自不同基因的信息聚合到字典中。
2. 处理该字典，使其符合你想要的格式。
3. 将结果写入 JSON 文件。

代码如下：

import csv
import json
from collections import defaultdict

# Tab-separated input files to merge, and the JSON file to produce.
input_files = ['f1.tsv', 'f2.tsv']
output_file = 'genes.json'

# Step 1: group every row by gene, then by sample.  Whatever columns are left
# after removing 'gene' and 'sample' form one "extras" record for that pair.
# A file lacking a 'gene' or 'sample' column makes pop() raise KeyError.
gene_dict = defaultdict(lambda: defaultdict(list))
for path in input_files:
    # newline='' is what the csv module asks for, so the reader itself
    # interprets line endings and newlines embedded in quoted fields.
    with open(path, 'r', newline='') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            gene = row.pop('gene')
            sample = row.pop('sample')
            gene_dict[gene][sample].append(row)

# Step 2: reshape the nested dictionaries into the list-of-objects layout
# that gets written out: one object per gene, each with a list of samples.
out = [{'gene': gene,
        'samples': [{'sample': sample, 'extras': extras}
                    for sample, extras in samples.items()]}
       for gene, samples in gene_dict.items()]

# Step 3: serialise the result to the output file.
with open(output_file, 'w') as f:
    json.dump(out, f)

网友

2楼 · 编辑于 2024-09-28 23:32:50

这里还有另一种做法。我尽量让它在你以后添加更多文件时更容易管理：可以在命令行上运行，每个要合并的文件作为一个参数传入。基因/样本名称存储在字典中以提高查找效率。所需 JSON 对象的格式化由每个类的 format() 方法完成。希望这对你有帮助。

import csv, json, sys

class Sample(object):
    """One sample of a gene together with its list of extra records."""

    def __init__(self, name, extras):
        """Create a sample called *name* whose first record is *extras*."""
        # Sample identifier; emitted as the 'sample' value by format().
        self.name = name
        # List of records; the constructor argument becomes its only element.
        self.extras = [extras]

    def format(self):
        """Return the sample as a ``{'sample': ..., 'extras': ...}`` dict.

        The 'extras' value is the very list held by the instance, not a copy.
        """
        return {'sample': self.name, 'extras': self.extras}

    def add_extras(self, extras):
        """Append every item of the iterable *extras* to this sample's list."""
        # edit 8/20: always just add the new extras to the list
        self.extras.extend(extras)

class Gene(object):
    """A gene and the samples recorded for it.

    ``samples`` maps a sample name to a sample object.  The methods below
    rely on those objects exposing ``name``, ``extras``, ``format()`` and
    ``add_extras()``.
    """

    def __init__(self, name, samples):
        """Store the gene *name* and the *samples* mapping (kept by reference)."""
        self.name = name
        self.samples = samples

    def format(self):
        """Return ``{'gene': name, 'samples': [...]}``.

        The samples list holds each sample's ``format()`` result, ordered by
        its 'sample' value.
        """
        formatted = [sample.format() for sample in self.samples.values()]
        formatted.sort(key=lambda entry: entry['sample'])
        return {'gene': self.name, 'samples': formatted}

    def create_or_add_samples(self, new_samples):
        """Merge the *new_samples* mapping into this gene.

        A sample whose name is already present contributes only its extras to
        the stored sample; an unknown sample is stored as-is under its name.
        """
        for sample in new_samples.values():
            if sample.name in self.samples:
                self.samples[sample.name].add_extras(sample.extras)
            else:
                self.samples[sample.name] = sample

class Genes(object):
    """Collection of gene objects keyed by gene name."""

    def __init__(self):
        # Maps gene name -> gene object.
        self.genes = {}

    def format(self):
        """Return each gene's ``format()`` result as a list ordered by 'gene'."""
        formatted = [entry.format() for entry in self.genes.values()]
        return sorted(formatted, key=lambda item: item['gene'])

    def create_or_add_gene(self, gene):
        """Register *gene*, or fold its samples into the stored gene of that name."""
        if gene.name in self.genes:
            self.genes[gene.name].create_or_add_samples(gene.samples)
        else:
            self.genes[gene.name] = gene

def row_to_gene(headers, row):
    """Build a single-sample Gene from one data row.

    Each cell of *row* is matched to the entry of *headers* at the same
    position.  The "gene" and "sample" columns supply the names; every other
    column is collected into the sample's extras dict.  Names default to the
    empty string when their column is absent.  A row longer than *headers*
    raises IndexError.
    """
    gene_name = ""
    sample_name = ""
    extras = {}
    for position, cell in enumerate(row):
        column = headers[position]
        if column == "gene":
            gene_name = cell
        elif column == "sample":
            sample_name = cell
        else:
            extras[column] = cell
    return Gene(gene_name, {sample_name: Sample(sample_name, extras)})

if __name__ == '__main__':
    # Every command-line argument is the path of one tab-separated file; all
    # of them are merged into a single JSON document.
    delim = "\t"
    genes = Genes()
    files = sys.argv[1:]

    for file in files:
        print("Reading " + str(file))
        # newline='' is what the csv module asks for, so the reader itself
        # interprets line endings and newlines embedded in quoted fields.
        with open(file, 'r', newline='') as f1:
            reader = csv.reader(f1, delimiter=delim)
            headers = []
            for row in reader:
                if not row:
                    # csv.reader yields [] for a blank line; skip it so that
                    # it does not turn into a gene with an empty name.
                    continue
                if not headers:
                    # The first non-blank row names the columns.
                    headers = row
                else:
                    genes.create_or_add_gene(row_to_gene(headers, row))

    # Show the merged structure on stdout and also save it to disk.
    result = json.dumps(genes.format(), indent=4)
    print(result)
    with open('json_output.txt', 'w') as output:
        output.write(result)

网友

3楼 · 编辑于 2024-09-28 23:32:50

这看起来是适合用 pandas 解决的问题！不幸的是，pandas 只能帮我们完成一部分工作，剩下的操作还得自己动手。这段代码既不快，也不是特别高效，但它可以完成任务。

import pandas as pd
import json
from collections import defaultdict

# Load both input tables as pandas DataFrames.  sep=r'\s+' splits on runs of
# whitespace and is the documented replacement for the deprecated
# delim_whitespace=True.
f1 = pd.read_table('f1.tsv', sep=r'\s+')
f2 = pd.read_table('f2.tsv', sep=r'\s+')

# Let pandas combine them: an outer merge keeps (gene, sample) pairs that
# appear in only one of the two files.
newframe = f1.merge(f2, how='outer', on=['gene', 'sample'])

# Round-trip through JSON to obtain plain Python objects (a list of dicts,
# one per merged row).
pythonList = json.loads(newframe.to_json(orient='records'))


# gene name -> {'gene': name, 'samples': [...]}
newDict = {}
for d in pythonList:
    gene = d['gene']
    sample = d['sample']
    sampleDict = {'sample': sample,
                  'extras': []}

    # group label -> {column name: value} for this row's "other*" columns
    extrasdict = defaultdict(dict)

    if gene not in newDict:
        newDict[gene] = {'gene': gene, 'samples': []}

    for key, value in d.items():
        # Only non-null columns whose name contains 'other' become extras.
        if 'other' not in key or value is None:
            continue
        # Text following the last 'other' in the column name.
        suffix = key.split('other')[-1]
        # A one-character suffix goes to group '1'; otherwise the first
        # character of the suffix names the group.
        # NOTE(review): a column ending in 'other' gives an empty suffix and
        # raises IndexError here, as it did before -- confirm the column
        # naming scheme.
        if len(suffix) == 1:
            extrasdict['1'][key] = value
        else:
            extrasdict['{}'.format(suffix[0])][key] = value

    sampleDict['extras'].extend(extrasdict.values())

    newDict[gene]['samples'].append(sampleDict)

newList = list(newDict.values())

print(json.dumps(newList))

如果这个方案看起来适合你，我很乐意花些时间整理一下，让它更易读、更高效。

PS：如果你喜欢 R，那么 pandas 会是很好的选择（它的目的就是为 Python 中的数据处理提供一个类似 R 的接口）。

相关问题更多 >

编程相关推荐

热门问题

热门文章

Python：将两个CSV文件合并为多级JSON

相关问题 更多 >

编程相关推荐

热门问题

热门文章

相关问题更多 >