看看代码。当genMotifs的参数设置为n_seq=5000和n_pos=10时,并行版本getPairedSeqNames3和getPairedSeqNames1比串行版本getPairedSeqNames2慢得多。但当n_seq=50且n_pos=2000时,并行版本的性能更好。不幸的是,我处理的数据更像是n_seq=5000和n_pos=10的情况。有人能告诉我为什么会这样吗?当n_seq=5000且n_pos=10时,有没有办法使并行版本的性能更好?
代码如下:
#! /usr/bin/env python
import pp, sys, random, time
def getMotif_SeqName(Motifs):
    """Map every motif uid to the set of sequence names recorded under it.

    Motifs is a dict of dicts (uid -> {seqname: ...}); only the inner keys
    are read here, the inner values are ignored.
    """
    names_by_uid = {}
    for uid in Motifs:
        names_by_uid[uid] = set(Motifs[uid])
    return names_by_uid
def getPairedList(uids):
    """Return every pair (id1, id2) where id1 precedes id2 in uids and differs.

    uids may be any iterable (list, dict key view, iterator, ...); it is
    copied into a list first because the pairing needs slicing. Pairs are
    produced in input order. Entries that compare equal are never paired
    with each other, so duplicated ids do not yield (x, x).

    Returns a list of 2-tuples; empty when fewer than two distinct ids exist.
    """
    uids = list(uids)
    pairs = []
    for i, id1 in enumerate(uids):
        # Start after position i: an id never needs to be compared with itself.
        for id2 in uids[i + 1:]:
            if id1 != id2:
                pairs.append((id1, id2))
    return pairs
def is_overlap(pos_pair):
    """Tell whether the two positions in pos_pair overlap.

    pos_pair is a 2-item sequence (posA, posB); each position is itself a
    sequence whose min() and max() are taken as its bounds. Touching bounds
    count as overlapping.
    """
    first, second = pos_pair
    # Overlap means neither position lies strictly on one side of the other.
    return max(first) >= min(second) and min(first) <= max(second)
def caclDist(pos_pair):
    """Return the signed gap between the two positions in pos_pair.

    pos_pair is (posA, posB). For positions that do not overlap the result
    is positive when posB lies after posA and negative when it lies before.
    The callers in this file filter out overlapping pairs before calling.
    """
    first, second = pos_pair
    gap_after = min(second) - max(first)
    gap_before = min(first) - max(second)
    if gap_after > gap_before:
        return gap_after
    return -gap_before
def getDist(posA, posB, low, high):
    """Collect signed distances between non-overlapping position pairs.

    Every combination (a, b) with a from posA and b from posB is examined.
    Overlapping combinations are skipped; the rest are kept when the
    absolute distance falls inside the inclusive range [low, high].

    Returns a dict mapping the (a, b) pair to its signed distance.
    """
    CoDist = {}
    for a in posA:
        for b in posB:
            pair = (a, b)
            if is_overlap(pair):
                continue
            d = caclDist(pair)
            if low <= abs(d) <= high:
                CoDist[pair] = d
    return CoDist
def getDist2(uidA, uidB, seqname, posA, posB, low, high):
    """Like getDist, but echo the identifying keys back with the result.

    uidA, uidB and seqname are not used in the computation; they are
    returned unchanged so results can be matched to their inputs.

    Returns the tuple (uidA, uidB, seqname, CoDist), where CoDist maps each
    kept (a, b) position pair to its signed distance.
    """
    # NOTE(review): kept self-contained rather than delegating to getDist,
    # because the caller in this file hands only is_overlap and caclDist to
    # the job server as helper functions -- confirm before refactoring.
    CoDist = {}
    for a in posA:
        for b in posB:
            pair = (a, b)
            if is_overlap(pair):
                continue
            d = caclDist(pair)
            if low <= abs(d) <= high:
                CoDist[pair] = d
    return (uidA, uidB, seqname, CoDist)
def ppCacl(job_server, inputs, equation, funs, packages, Progress=True):
    """Submit one job per input to job_server and return the results in order.

    job_server -- object with a submit(func, args, funs, packages) method
                  that returns a callable; calling it yields the job result.
    inputs     -- iterable of argument tuples, one per job.
    equation   -- function each job runs.
    funs       -- forwarded unchanged to job_server.submit.
    packages   -- forwarded unchanged to job_server.submit.
    Progress   -- accepted for interface compatibility; not used here.

    Returns a list with one result per input, in input order.
    """
    # All jobs are submitted before any result is collected.
    # NOTE(review): one job per input tuple; with many tiny inputs the
    # per-job overhead presumably dominates the run time -- confirm by
    # profiling.
    jobs = [job_server.submit(equation, pars, funs, packages) for pars in inputs]
    return [job() for job in jobs]
def ssCacl(inputs, equation):
    """Serially apply equation to each 4-item argument tuple in inputs.

    Every element of inputs must unpack into exactly four values, which are
    passed positionally to equation. Returns the results as a list, in
    input order.
    """
    return [equation(X, n, m, N) for (X, n, m, N) in inputs]
def getPairedSeqNames1(Motifs):
    """Find co-occurring motif pairs, running one batch of jobs per pair.

    For every pair of distinct motif uids that share sequence names, the
    positions on each shared sequence are handed to ppCacl (through the
    module-level job_server) to compute distances with getDist, using the
    fixed range 10..250.

    Returns {(uidA, uidB): {seqname: {(posA, posB): distance}}}; pairs and
    sequences without any qualifying distance are left out.
    """
    SeqNames = getMotif_SeqName(Motifs)
    MotifPairs = set(getPairedList(list(Motifs)))
    total = len(MotifPairs)
    # Parenthesized single argument: prints the same text as a print statement.
    print("%s pairs to go" % total)
    step = total / 100 + 1  # pairs per progress percent
    PairedMotifs = {}
    for idx, (uidA, uidB) in enumerate(MotifPairs):
        shared = list(SeqNames[uidA] & SeqNames[uidB])
        if not shared:
            continue
        PosA, PosB = Motifs[uidA], Motifs[uidB]
        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (idx / step, uidA, uidB))
        job_args = [(PosA[name], PosB[name], 10, 250) for name in shared]
        results = ppCacl(job_server, job_args, getDist, (is_overlap, caclDist), (), False)
        # Results come back in submission order, so they line up with shared.
        found = dict((name, d) for name, d in zip(shared, results) if d)
        if found:
            PairedMotifs[(uidA, uidB)] = found
    return PairedMotifs
def getPairedSeqNames2(Motifs):
    """Find co-occurring motif pairs serially (no job server involved).

    For every pair of distinct motif uids that share sequence names, the
    positions on each shared sequence are passed through ssCacl to compute
    distances with getDist, using the fixed range 10..250.

    Returns {(uidA, uidB): {seqname: {(posA, posB): distance}}}; pairs and
    sequences without any qualifying distance are left out.
    """
    SeqNames = getMotif_SeqName(Motifs)
    MotifPairs = set(getPairedList(list(Motifs)))
    total = len(MotifPairs)
    # Parenthesized single argument: prints the same text as a print statement.
    print("%s pairs to go" % total)
    step = total / 100 + 1  # pairs per progress percent
    PairedMotifs = {}
    for idx, (uidA, uidB) in enumerate(MotifPairs):
        shared = list(SeqNames[uidA] & SeqNames[uidB])
        if not shared:
            continue
        PosA, PosB = Motifs[uidA], Motifs[uidB]
        sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (idx / step, uidA, uidB))
        call_args = [(PosA[name], PosB[name], 10, 250) for name in shared]
        results = ssCacl(call_args, getDist)
        # ssCacl preserves input order, so results line up with shared.
        found = dict((name, d) for name, d in zip(shared, results) if d)
        if found:
            PairedMotifs[(uidA, uidB)] = found
    return PairedMotifs
def getPairedSeqNames3(Motifs):
    """Find co-occurring motif pairs, submitting all jobs in a single batch.

    The work items for every motif pair and every shared sequence are
    collected first, then handed to ppCacl once (through the module-level
    job_server) to run getDist2 with the fixed range 10..250. The results
    are regrouped by motif pair afterwards.

    Returns {(uidA, uidB): {seqname: {(posA, posB): distance}}}; pairs and
    sequences without any qualifying distance are left out.
    """
    SeqNames = getMotif_SeqName(Motifs)
    MotifPairs = set(getPairedList(list(Motifs)))
    num_MotifPairs = len(MotifPairs)
    # Parenthesized single argument: prints the same text as a print statement.
    print("%s pairs to go" % num_MotifPairs)
    num_MotifPairs = num_MotifPairs / 100 + 1  # pairs per progress percent
    PairedMotifs = {}
    positions = []
    # The progress line below tracks only this collection phase; the actual
    # computation happens in the single ppCacl call after the loop.
    for i, (uidA, uidB) in enumerate(MotifPairs):
        intersect = list(SeqNames[uidA] & SeqNames[uidB])
        if intersect:
            PosA = Motifs[uidA]
            PosB = Motifs[uidB]
            sys.stderr.write("Progress:%d%%\t%s\t%s\r" % (i / num_MotifPairs, uidA, uidB))
            positions.extend([(uidA, uidB, seqname, PosA[seqname], PosB[seqname], 10, 250) for seqname in intersect])
    distances = ppCacl(job_server, positions, getDist2, (is_overlap, caclDist), (), False)
    for (uidA, uidB, seqname, CoDist) in distances:
        if CoDist:
            # setdefault replaces the deprecated has_key check plus two extra
            # lookups of the same key.
            PairedMotifs.setdefault((uidA, uidB), {})[seqname] = CoDist
    return PairedMotifs
def genMotifs(n_seq=5000, n_pos=10):
    """Generate random test data in the shape the pairing functions expect.

    Draws 50 distinct uids from 1..59999. Each uid gets a random number
    (0..n_seq) of distinct sequence names from the same range, and each
    sequence name gets a list of 0..n_pos random positions from genPos.

    Returns {uid: {seqname: [position, ...]}}.
    """
    digits = range(1, 60000)
    Motifs = {}
    for uid in random.sample(digits, 50):
        how_many = random.randint(0, n_seq)
        per_seq = {}
        for seqname in random.sample(digits, how_many):
            per_seq[seqname] = genPos(random.randint(0, n_pos))
        Motifs[uid] = per_seq
    return Motifs
def genPos(n):
    """Return a list of n random positions.

    Each position is a 2-tuple of independent random integers in 0..3000
    (inclusive); the two values are not ordered. A non-positive n gives an
    empty list.
    """
    positions = []
    for _ in range(n):
        left = random.randint(0, 3000)
        right = random.randint(0, 3000)
        positions.append((left, right))
    return positions
# Benchmark driver: time the three pairing implementations on two data sets.
# job_server is a module-level global read by getPairedSeqNames1/3.
job_server = pp.Server()
# First data set uses genMotifs' defaults, second uses n_seq=50, n_pos=2000.
for gen_args in ((), (50, 2000)):
    Motifs = genMotifs(*gen_args)
    for runner in (getPairedSeqNames1, getPairedSeqNames2, getPairedSeqNames3):
        timestamp = time.time()
        runner(Motifs)
        # Elapsed wall-clock seconds for this implementation.
        print(time.time() - timestamp)
我电脑上的结果:
getPairedSeqNames3 在 n_seq=5000、n_pos=10 时的cProfile结果
getPairedSeqNames3 n_seq=10 n_pos=5000的cProfile
getPairedSeqNames3 n_seq=20 n_pos=2500的cProfile
我更改了您的代码以使用更好的python习惯用法:
我不能保证这会更快。如果你想优化代码,我建议先用cProfile进行性能分析,或者考虑使用numpy或cython。
相关问题 更多 >
编程相关推荐