为什么在某些情况下,使用python的pp模块进行并行计算比较慢?

2024-09-21 05:33:04 发布

您现在位置:Python中文网/ 问答频道 /正文

请看下面的代码。当genMotifs的参数设置为n_seq=5000和n_pos=10时,并行版本getPairedSeqNames3和getPairedSeqNames1要慢得多。但当n_seq=50且n_pos=2000时,并行版本的性能更好。不幸的是,我处理的数据更接近n_seq=5000和n_pos=10的情况。有人能告诉我为什么会这样吗?当n_seq=5000且n_pos=10时,有没有办法让并行版本的性能更好?

代码如下:

#! /usr/bin/env python
import pp, sys, random, time
def getMotif_SeqName(Motifs):
    """Map every motif uid to the set of sequence names it occurs in.

    ``Motifs`` is a dict of dicts (``uid -> {seqname: ...}``); only the keys
    of each inner dict are kept.
    """
    return {uid: set(by_seq.keys()) for uid, by_seq in Motifs.items()}

def getPairedList(uids):
    """Return the pairs of differing ids from ``uids``, in input order.

    Each id is combined with the ids found at or after its own position in
    ``uids``; combinations whose two members compare equal are skipped.
    ``uids`` must support slicing (e.g. a list).
    """
    pairs = []
    for start, first in enumerate(uids):
        for second in uids[start:]:
            if first != second:
                pairs.append((first, second))
    return pairs

def is_overlap(pos_pair):
    """Tell whether the two positions in ``pos_pair`` overlap.

    ``pos_pair`` is ``(posA, posB)``; each position is a non-empty iterable of
    coordinates and is treated as the span from its minimum to its maximum.
    Spans that merely touch count as overlapping.
    """
    posA, posB = pos_pair
    return not (max(posA) < min(posB) or min(posA) > max(posB))

def caclDist(pos_pair):
    """Return the signed gap between the two positions in ``pos_pair``.

    For non-overlapping spans the result is positive when ``posB`` lies
    entirely after ``posA`` and negative when it lies entirely before it.
    """
    posA, posB = pos_pair
    gap_b_after_a = min(posB) - max(posA)
    gap_a_after_b = min(posA) - max(posB)
    return gap_b_after_a if gap_b_after_a > gap_a_after_b else -gap_a_after_b

def getDist(posA, posB, low, high):
    """Collect the in-range distances between two lists of positions.

    Every position in ``posA`` is combined with every position in ``posB``.
    Overlapping combinations (per ``is_overlap``) are dropped and the signed
    distance of each remaining one is computed with ``caclDist``.

    Returns a dict mapping ``(a, b)`` position pairs to their distance,
    keeping only pairs with ``low <= abs(distance) <= high``.
    """
    CoDist = {}
    for a in posA:
        for b in posB:
            pair = (a, b)
            if is_overlap(pair):
                continue
            dist = caclDist(pair)
            if low <= abs(dist) <= high:
                CoDist[pair] = dist
    return CoDist

def getDist2(uidA, uidB, seqname, posA, posB, low, high):
    """Compute in-range distances and tag them with the ids they belong to.

    Performs the same computation as ``getDist`` but returns
    ``(uidA, uidB, seqname, CoDist)`` so results from a flat batch of jobs can
    be regrouped afterwards.

    The body deliberately calls only ``is_overlap`` and ``caclDist``: those
    are the helper functions getPairedSeqNames3 hands to the job server
    alongside this function.
    """
    candidates = [(a, b) for a in posA for b in posB if not is_overlap((a, b))]
    CoDist = {}
    for pair in candidates:
        dist = caclDist(pair)
        if low <= abs(dist) <= high:
            CoDist[pair] = dist
    return (uidA, uidB, seqname, CoDist)

def ppCacl(job_server, inputs, equation, funs, packages, Progress=True):
    """Run ``equation`` once per argument tuple on a job server.

    Args:
        job_server: object whose ``submit(func, args, depfuncs, modules)``
            returns a callable job handle (here a ``pp.Server``).
        inputs: iterable of argument tuples, one per job.
        equation: function executed by each job.
        funs: tuple of helper functions ``equation`` calls.
        packages: tuple of module names ``equation`` needs.
        Progress: unused; kept so existing callers keep working.

    Returns:
        List of job results, in the order of ``inputs``.

    NOTE: one job is submitted per element of ``inputs``, so any fixed
    per-job cost is paid for every element; many tiny inputs will be
    dominated by that overhead.
    """
    # Submit everything before collecting so the jobs can run concurrently.
    jobs = [job_server.submit(equation, pars, funs, packages) for pars in inputs]
    # Calling a job handle waits for that job and yields its result.
    return [job() for job in jobs]


def ssCacl(inputs, equation):
    """Serially apply ``equation`` to every 4-tuple in ``inputs``.

    Returns the list of results in input order.
    """
    return [equation(X, n, m, N) for (X, n, m, N) in inputs]

def getPairedSeqNames1(Motifs):
    """Compute pairwise motif distances, parallelising within each motif pair.

    For every pair of motifs that share at least one sequence name, one job
    per shared sequence is run through ``ppCacl`` on the module-level
    ``job_server``. All results for a pair are collected before the next pair
    is submitted.

    Args:
        Motifs: dict ``uid -> {seqname: [positions]}``.

    Returns:
        dict ``(uidA, uidB) -> {seqname: {(posA, posB): distance}}`` holding
        only the non-empty results (distance window 10..250).
    """
    SeqNames = getMotif_SeqName(Motifs)
    MotifPairs = set(getPairedList(list(Motifs)))
    num_MotifPairs = len(MotifPairs)
    print("%s pairs to go" % num_MotifPairs)
    # Pairs per progress percent; integer division keeps the step integral.
    pairs_per_percent = num_MotifPairs // 100 + 1
    PairedMotifs = {}

    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]

        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // pairs_per_percent, uidA, uidB))
        positions = [(PosA[seqname], PosB[seqname], 10, 250) for seqname in intersect]
        results = ppCacl(job_server, positions, getDist, (is_overlap, caclDist), (), False)
        # Pair results with their sequence names via zip rather than by index,
        # so the outer loop counter `i` is not rebound by a comprehension.
        distances = dict((seqname, d) for seqname, d in zip(intersect, results) if d)
        if distances:
            PairedMotifs[(uidA, uidB)] = distances
    return PairedMotifs

def getPairedSeqNames2(Motifs):
    """Compute pairwise motif distances serially (no job server involved).

    For every pair of motifs that share at least one sequence name, the
    distances for each shared sequence are computed in-process via ``ssCacl``.

    Args:
        Motifs: dict ``uid -> {seqname: [positions]}``.

    Returns:
        dict ``(uidA, uidB) -> {seqname: {(posA, posB): distance}}`` holding
        only the non-empty results (distance window 10..250).
    """
    SeqNames = getMotif_SeqName(Motifs)
    MotifPairs = set(getPairedList(list(Motifs)))
    num_MotifPairs = len(MotifPairs)
    print("%s pairs to go" % num_MotifPairs)
    # Pairs per progress percent; integer division keeps the step integral.
    pairs_per_percent = num_MotifPairs // 100 + 1
    PairedMotifs = {}

    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]

        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // pairs_per_percent, uidA, uidB))
        positions = [(PosA[seqname], PosB[seqname], 10, 250) for seqname in intersect]
        results = ssCacl(positions, getDist)
        # Pair results with their sequence names via zip rather than by index,
        # so the outer loop counter `i` is not rebound by a comprehension.
        distances = dict((seqname, d) for seqname, d in zip(intersect, results) if d)
        if distances:
            PairedMotifs[(uidA, uidB)] = distances
    return PairedMotifs

def getPairedSeqNames3(Motifs):
    """Compute pairwise motif distances with one flat batch of parallel jobs.

    The work items for all motif pairs are gathered first, submitted together
    through ``ppCacl`` on the module-level ``job_server``, and the tagged
    results returned by ``getDist2`` are regrouped by motif pair afterwards.

    Args:
        Motifs: dict ``uid -> {seqname: [positions]}``.

    Returns:
        dict ``(uidA, uidB) -> {seqname: {(posA, posB): distance}}`` holding
        only the non-empty results (distance window 10..250).
    """
    SeqNames = getMotif_SeqName(Motifs)
    MotifPairs = set(getPairedList(list(Motifs)))
    num_MotifPairs = len(MotifPairs)
    print("%s pairs to go" % num_MotifPairs)
    # Pairs per progress percent; integer division keeps the step integral.
    pairs_per_percent = num_MotifPairs // 100 + 1
    PairedMotifs = {}
    positions = []

    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]

        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // pairs_per_percent, uidA, uidB))
        positions.extend(
            (uidA, uidB, seqname, PosA[seqname], PosB[seqname], 10, 250)
            for seqname in intersect
        )

    distances = ppCacl(job_server, positions, getDist2, (is_overlap, caclDist), (), False)
    for (uidA, uidB, seqname, CoDist) in distances:
        if CoDist:
            # setdefault replaces the deprecated dict.has_key() check.
            PairedMotifs.setdefault((uidA, uidB), {})[seqname] = CoDist
    return PairedMotifs


def genMotifs(n_seq=5000, n_pos=10):
    """Generate random benchmark data for 50 motifs.

    Each motif uid gets between 0 and ``n_seq`` sequence names, and each
    sequence name gets a list of between 0 and ``n_pos`` positions produced by
    ``genPos``. Uids and sequence names are sampled from 1..59999.

    Returns a dict ``uid -> {seqname: [positions]}``.
    """
    digits = range(1, 60000)
    Motifs = {}
    for uid in random.sample(digits, 50):
        seqnames = random.sample(digits, random.randint(0, n_seq))
        Motifs[uid] = dict(
            (seqname, genPos(random.randint(0, n_pos))) for seqname in seqnames
        )
    return Motifs


def genPos(n):
    """Return ``n`` random positions.

    Each position is a tuple of two integers drawn independently from
    0..3000 (inclusive); the two values are not ordered.
    """
    # range() instead of the Python 2-only xrange(); n is small here.
    return [(random.randint(0, 3000), random.randint(0, 3000)) for _ in range(n)]


# Job server read as a global by getPairedSeqNames1 and getPairedSeqNames3.
job_server = pp.Server()

# Time the three implementations on both data shapes: first the genMotifs
# defaults (many sequences, few positions), then few sequences with many
# positions. Each timing is printed in seconds.
for motif_args in ((), (50, 2000)):
    Motifs = genMotifs(*motif_args)
    for run in (getPairedSeqNames1, getPairedSeqNames2, getPairedSeqNames3):
        timestamp = time.time()
        run(Motifs)
        print(time.time() - timestamp)

我电脑上的结果:

^{pr2}$

getPairedSeqNames3的cProfile n_seq=5000 n_pos=10 enter image description here

getPairedSeqNames3 n_seq=10 n_pos=5000的cProfile enter image description here

getPairedSeqNames3 n_seq=20 n_pos=2500的cProfile enter image description here


Tags: in pos for return if time def num
1条回答
网友
1楼 · 发布于 2024-09-21 05:33:04

我更改了您的代码以使用更好的python习惯用法:

#! /usr/bin/env python
import pp
import sys
import random
import time
from collections import defaultdict

# Shared pp job server; getPairedSeqNames1 and getPairedSeqNames3 read it as a
# module-level global when they call ppCacl.
job_server = pp.Server()


def getMotif_SeqName(Motifs):
    """Build ``uid -> set of sequence names`` from ``uid -> {seqname: ...}``."""
    seq_names = {}
    for uid, by_seq in Motifs.items():
        seq_names[uid] = set(by_seq.keys())
    return seq_names


def getPairedList(uids):
    """Return the pairs of differing ids from ``uids``, in input order.

    Each id is combined with the ids at or after its own position; pairs whose
    members compare equal are skipped. ``uids`` must support slicing.
    """
    pairs = []
    for offset, left in enumerate(uids):
        pairs.extend((left, right) for right in uids[offset:] if left != right)
    return pairs


def is_overlap(pos_pair):
    """Return True when the spans of the two positions in ``pos_pair`` overlap.

    Each position is a non-empty iterable of coordinates, treated as the span
    from its minimum to its maximum; touching spans count as overlapping.
    """
    posA, posB = pos_pair
    # posA ends before posB starts.
    if max(posA) < min(posB):
        return False
    # posA starts after posB ends.
    if min(posA) > max(posB):
        return False
    return True


def caclDist(pos_pair):
    """Return the signed gap between the two positions in ``pos_pair``.

    For non-overlapping spans: positive when ``posB`` lies after ``posA``,
    negative when it lies before.
    """
    posA, posB = pos_pair
    forward_gap = min(posB) - max(posA)
    backward_gap = min(posA) - max(posB)
    if forward_gap > backward_gap:
        return forward_gap
    return -backward_gap


def getDist(posA, posB, low, high):
    """Collect the in-range distances between two lists of positions.

    Every position of ``posA`` is combined with every position of ``posB``;
    overlapping combinations are dropped and the rest are measured with
    ``caclDist``.

    Returns a dict ``{(a, b): distance}`` restricted to pairs with
    ``low <= abs(distance) <= high``.
    """
    in_range = {}
    for a in posA:
        for b in posB:
            pair = (a, b)
            if is_overlap(pair):
                continue
            dist = caclDist(pair)
            if low <= abs(dist) <= high:
                in_range[pair] = dist
    return in_range


def getDist2(uidA, uidB, seqname, posA, posB, low, high):
    """Run ``getDist`` and tag the result with the ids it belongs to.

    Returns ``(uidA, uidB, seqname, CoDist)``.

    NOTE(review): this delegates to ``getDist``, so when it is submitted to a
    pp job server, ``getDist`` (and the helpers it calls) presumably have to be
    listed among the dependent functions. Verify against the caller.
    """
    CoDist = getDist(posA, posB, low, high)
    return (uidA, uidB, seqname, CoDist)


def ppCacl(job_server, inputs, equation, funs, packages, Progress=True):
    """Run ``equation`` once per argument tuple on a job server.

    Args:
        job_server: object whose ``submit(func, args, depfuncs, modules)``
            returns a callable job handle (here a ``pp.Server``).
        inputs: iterable of argument tuples, one per job.
        equation: function executed by each job.
        funs: tuple of helper functions ``equation`` calls.
        packages: tuple of module names ``equation`` needs.
        Progress: unused; kept so existing callers keep working.

    Returns:
        List of job results, in the order of ``inputs``.
    """
    # This must be a list, not a generator: a lazy generator would submit one
    # job, wait for its result, and only then submit the next, so the jobs
    # would run one after another instead of concurrently.
    jobs = [job_server.submit(equation, pars, funs, packages) for pars in inputs]
    # Calling a job handle waits for that job and yields its result.
    return [job() for job in jobs]


def ssCacl(inputs, equation):
    """Serially apply ``equation`` to every 4-tuple in ``inputs``.

    Returns the list of results in input order.
    """
    results = []
    for X, n, m, N in inputs:
        results.append(equation(X, n, m, N))
    return results


def getPairedSeqNames1(Motifs, SeqNames, MotifPairs):
    """Compute pairwise motif distances, parallelising within each motif pair.

    For each pair in ``MotifPairs`` whose motifs share sequence names, one job
    per shared sequence is run through ``ppCacl`` on the module-level
    ``job_server`` before moving on to the next pair.

    Args:
        Motifs: dict ``uid -> {seqname: [positions]}``.
        SeqNames: dict ``uid -> set of seqnames``.
        MotifPairs: collection of ``(uidA, uidB)`` pairs to evaluate.

    Returns:
        dict ``(uidA, uidB) -> {seqname: {(posA, posB): distance}}`` with
        empty results left out (distance window 10..250).
    """
    total_pairs = len(MotifPairs)
    print("%s pairs to go" % total_pairs)
    # Pairs per progress percent (integer division, as `/` is on Python 2 ints).
    pairs_per_percent = total_pairs // 100 + 1
    PairedMotifs = {}

    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]

        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // pairs_per_percent, uidA, uidB))
        positions = [(PosA[seqname], PosB[seqname], 10, 250) for seqname in intersect]
        results = ppCacl(job_server, positions, getDist, (is_overlap, caclDist), (), False)
        distances = {}
        for seqname, d in zip(intersect, results):
            if d:
                distances[seqname] = d
        if distances:
            PairedMotifs[(uidA, uidB)] = distances
    return PairedMotifs


def getPairedSeqNames2(Motifs, SeqNames, MotifPairs):
    """Compute pairwise motif distances serially via ``ssCacl``.

    Args:
        Motifs: dict ``uid -> {seqname: [positions]}``.
        SeqNames: dict ``uid -> set of seqnames``.
        MotifPairs: collection of ``(uidA, uidB)`` pairs to evaluate.

    Returns:
        dict ``(uidA, uidB) -> {seqname: {(posA, posB): distance}}`` with
        empty results left out (distance window 10..250).
    """
    total_pairs = len(MotifPairs)
    print("%s pairs to go" % total_pairs)
    # Pairs per progress percent (integer division, as `/` is on Python 2 ints).
    pairs_per_percent = total_pairs // 100 + 1
    PairedMotifs = {}

    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]

        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // pairs_per_percent, uidA, uidB))
        # Lazy on purpose: ssCacl consumes the work items one at a time.
        positions = ((PosA[seqname], PosB[seqname], 10, 250) for seqname in intersect)
        results = ssCacl(positions, getDist)
        distances = {}
        for seqname, d in zip(intersect, results):
            if d:
                distances[seqname] = d
        if distances:
            PairedMotifs[(uidA, uidB)] = distances
    return PairedMotifs


def getPairedSeqNames3(Motifs, SeqNames, MotifPairs):
    """Compute pairwise motif distances with one flat batch of parallel jobs.

    Work items for all pairs are gathered first, submitted together through
    ``ppCacl`` on the module-level ``job_server``, and the tagged results from
    ``getDist2`` are regrouped by motif pair afterwards.

    Args:
        Motifs: dict ``uid -> {seqname: [positions]}``.
        SeqNames: dict ``uid -> set of seqnames``.
        MotifPairs: collection of ``(uidA, uidB)`` pairs to evaluate.

    Returns:
        ``defaultdict(dict)`` mapping ``(uidA, uidB)`` to
        ``{seqname: {(posA, posB): distance}}``; empty results are left out
        (distance window 10..250).
    """
    num_MotifPairs = len(MotifPairs)
    print("%s pairs to go" % num_MotifPairs)
    # Pairs per progress percent (integer division, as `/` is on Python 2 ints).
    pairs_per_percent = num_MotifPairs // 100 + 1
    PairedMotifs = defaultdict(dict)
    positions = []

    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if not intersect:
            continue
        PosA = Motifs[uidA]
        PosB = Motifs[uidB]

        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i // pairs_per_percent, uidA, uidB))
        positions.extend(
            (uidA, uidB, seqname, PosA[seqname], PosB[seqname], 10, 250)
            for seqname in intersect
        )

    # getDist2 delegates to getDist, so getDist has to be shipped to the
    # workers along with the helpers it calls; otherwise the name is undefined
    # on the worker side.
    depfuncs = (getDist, is_overlap, caclDist)
    distances = ppCacl(job_server, positions, getDist2, depfuncs, (), False)
    for (uidA, uidB, seqname, CoDist) in distances:
        if CoDist:
            PairedMotifs[(uidA, uidB)][seqname] = CoDist
    return PairedMotifs


def genMotifs(n_seq, n_pos):
    """Generate random benchmark data for 50 motifs.

    Each motif uid gets between 0 and ``n_seq`` sequence names, and each
    sequence name gets between 0 and ``n_pos`` positions from ``genPos``.
    Uids and sequence names are sampled from 1..59999.

    Returns a dict ``uid -> {seqname: [positions]}``.
    """
    digits = range(1, 60000)
    motifs = {}
    for uid in random.sample(digits, 50):
        by_seq = {}
        for seqname in random.sample(digits, random.randint(0, n_seq)):
            by_seq[seqname] = genPos(random.randint(0, n_pos))
        motifs[uid] = by_seq
    return motifs


def genPos(n):
    """Return ``n`` random positions.

    Each position is a tuple of two integers drawn independently from
    0..3000 (inclusive); the two values are not ordered.
    """
    # range() instead of the Python 2-only xrange(); n is small here.
    return [(random.randint(0, 3000), random.randint(0, 3000)) for _ in range(n)]


def driver(Motifs):
    """Time the three implementations on the same data set.

    The sequence-name index and the set of motif pairs are built once and
    shared by all three runs; the elapsed seconds of each run are printed.
    """
    SeqNames = getMotif_SeqName(Motifs)
    MotifPairs = set(getPairedList(Motifs.keys()))
    implementations = (getPairedSeqNames1, getPairedSeqNames2, getPairedSeqNames3)
    for fn in implementations:
        started = time.time()
        fn(Motifs, SeqNames, MotifPairs)
        elapsed = time.time() - started
        print(elapsed)


if __name__ == '__main__':
    # Benchmark both data shapes given as (n_seq, n_pos): many sequences with
    # few positions, then few sequences with many positions.
    data_shapes = ((5000, 10), (50, 2000))
    for x, y in data_shapes:
        print('=' * 30)
        driver(genMotifs(x, y))

我不能保证这样会更快。如果你想优化代码,我建议先用cProfile做性能分析,或者考虑使用numpy或cython。

相关问题 更多 >

    热门问题