多处理一个简单的Python foreach?

2024-09-29 23:31:53 发布

您现在位置:Python中文网/ 问答频道 /正文

开始学习python,我正在搜索如何提高已注释的循环START/END的执行速度。您认为多处理是最好的解决方案,还是有其他方法可以提高性能,因为只使用了10%的CPU

def merge_csv(names):
        n = 0
        while n < len(names[0]):
            _, filename["%s" %n] = os.path.split(names[0][n])
            temp_data["%s" %n] = pandas.read_csv(open(names[0][n]), sep=',|;')
        
            data["%s" %n] = pandas.DataFrame(temp_data["%s" %n])
            data["%s" %n].insert(1, "oround", 1)
            data["%s" %n].insert(3, "pround", 1)
            
            for i in range(len(data["%s" %n].mz)):
                data["%s" %n].iloc[i,1]=round(data["%s" %n].iloc[i,0],2)
                data["%s" %n].iloc[i,3]=round(data["%s" %n].iloc[i,2],1)

            liste = list(zip(data["%s" %n]["oround"], data["%s" %n]["pround"]))
            listeall.extend(liste)
            n = n+1
        
        liste2 = list(set([u for u in listeall]))
        
        merged = pandas.DataFrame(liste2)
        colonnes = ["o", "oround", "p", "pround", "td", "u", "counts"]
        m = 0
        j = 0
        for j in range(len(names[0])):
                name_data = os.path.basename(names[0][j])
                for m in range(len(colonnes)):
                    colm = colonnes[m] + name_data
                    merged.insert(len(colonnes)*j+2+m, colm, 0)
         
       """
       START
        p = 0
        n = 0
        x=0
        y=0
        for n in range(len(names[0])):
            for p in range(len(liste2)):
                couple = liste2[p]
                resultmz = data["%s" %n]["oround"].isin([couple[0]])
                listemz = list(resultmz[resultmz == True].index)
                resultrt = data["%s" %n]["pround"].isin([couple[1]])
                listert = list(resultrt[resultrt == True].index)
                if len(listemz)>0 and len(listert)>0:
                    for x in range(len(listemz)) :
                        for y in range(len(listert)) :
                            if listemz[x] == listert[y]:
                                for k in range(7):
                                    merged.iloc[p,7*n+2+k] = data["%s" %n].iloc[int(listemz[x]),k]
        END
        """          
        csvmerged = results.to_csv('merged.csv')
        
        print(csvmerged)             
        return merged

我知道如何在Python中启动单线程,但不知道如何“收集”结果

并行化这个循环最简单的方法是什么

for n in range(len(names[0])):

建议后的代码:

def merge_file_CCS(names):
        n = 0
        while n < len(names[0]):
            _, filename["%s" %n] = os.path.split(names[0][n])
            temp_data["%s" %n] = pandas.read_csv(open(names[0][n]), sep=',|;')
        
            data["%s" %n] = pandas.DataFrame(temp_data["%s" %n])
            data["%s" %n].insert(1, "oround", 1)
            data["%s" %n].insert(3, "pround", 1)
            
            for i in range(len(data["%s" %n].mz)):
                data["%s" %n].iloc[i,1]=round(data["%s" %n].iloc[i,0],2)
                data["%s" %n].iloc[i,3]=round(data["%s" %n].iloc[i,2],1)
            
            liste = list(zip(data["%s" %n]["oround"], data["%s" %n]["pround"]))
            listeall.extend(liste)
            print("CSV read in --- %s seconds ---" % (time.time() - start_time))
            n = n+1
            
        print("All CSV read in --- %s seconds ---" % (time.time() - start_time))

        liste2 = list(set([u for u in listeall]))
        
        merged = pandas.DataFrame(liste2)
        colonnes = ["o", "oround", "p", "pround", "td", "u", "counts"]
        m = 0
        j = 0
        for j in range(len(names[0])):
                name_data = os.path.basename(names[0][j])
                for m in range(len(colonnes)):
                    colm = colonnes[m] + name_data
                    merged.insert(len(colonnes)*j+2+m, colm, 0)
        
        pools = Pool(num_pool)
        results = []
        for result in pools.imap(pool_merge, names):
            results.append(result)
        pools.close()
        pools.join()
                    
        csvmerged = merged.to_csv('merged.csv')
        
        print(csvmerged)             
        return merged

def pool_merge(n):
        p = 0
        x = 0
        y = 0
        for p in range(len(liste2)):
            couple = liste2[p]
            resultmz = data["%s" %n]["oround"].isin([couple[0]])
            listemz = list(resultmz[resultmz == True].index)
            resultrt = data["%s" %n]["pround"].isin([couple[1]])
            listert = list(resultrt[resultrt == True].index)
            if len(listemz)>0 and len(listert)>0:
                for x in range(len(listemz)) :
                    for y in range(len(listert)) :
                        if listemz[x] == listert[y]:
                            for k in range(7):
                                merged.iloc[p,7*n+2+k] = data["%s" %n].iloc[int(listemz[x]),k]

但它给了我错误

AttributeError: Can't get attribute 'pool_merge' on <module '__main__' (built-in)>

Tags: infordatalennamesrangemergedlist
1条回答
网友
1楼 · 发布于 2024-09-29 23:31:53

您可以使用Pool

from multiprocessing import Pool, cpu_count


num_pool = cpu_count()//2
pools = Pool(num_pool)
results = []
for result in pools.imap(merge_csv, names):
    results.append(result)
pools.close()
pools.join()

names是传递给函数的名称列表merge_csv。您可以使用pd.concat(results)来确定最终结果

https://docs.python.org/3/library/multiprocessing.html

相关问题 更多 >

    热门问题