import re
def findGaps(aSeq):
# Get and print the list of gaps present into the sequence
gaps = re.findall('[-]+', aSeq)
print('Number of gaps = {0} \n'.format(len(gaps)))
# Get and print start index, end index and length for each gap
for i,gap in enumerate(gaps,1):
startIndex = aSeq.index(gap)
endIndex = startIndex + len(gap) - 1
print('Index Position of Gap region {0} = {1} to {2}'.format(i, startIndex, endIndex))
print('Length of Gap region {0} = {1} \n'.format(i, len(gap)))
aSeq = aSeq.replace(gap,'*' * len(gap), 1)
findGaps("ATC----GCTGTA--A-----T")
>>> def get_seq_gaps(seq):
... gaps = np.array([i for i, el in enumerate(seq) if el == '-'])
... diff = np.cumsum(np.append([False], np.diff(gaps) != 1))
... un = np.unique(diff)
... yield len(un)
... for i in un:
... subseq = gaps[diff == i]
... yield i + 1, len(subseq), subseq.min(), subseq.max()
>>> def report_gaps(seq):
... gaps = get_seq_gaps(seq)
... print('Number of gaps = %s\n' % next(gaps), sep='')
... for (i, l, mn, mx) in gaps:
... print('Index Position of Gap region %s = %s to %s' % (i, mn, mx))
... print('Length of Gap Region %s = %s\n' % (i, l), sep='')
>>> seq = 'ATC----GCTGTA--A-----T'
>>> report_gaps(seq)
Number of gaps = 3
Index Position of Gap region 1 = 3 to 6
Length of Gap Region 1 = 4
Index Position of Gap region 2 = 13 to 14
Length of Gap Region 2 = 2
Index Position of Gap region 3 = 16 to 20
Length of Gap Region 3 = 5
import re
seq = 'ATC----GCTGTA--A-----T'
matches = list(re.finditer('-+', seq))
print 'Number of gaps =', len(matches)
print
for region_number, match in enumerate(matches, 1):
print 'Index Position of Gap region {} = {} to {}'.format(
region_number,
match.start(),
match.end() - 1)
print 'Length of Gap region {} = {}'.format(
region_number,
match.end() - match.start())
print
以下是我对代码的建议,非常直截了当,简短易懂,除了
re
之外没有任何其他导入包:与regex相比,这有点冗长,但您可以找到连字符的索引,并使用第一个差异对它们进行分组:
首先,这将形成一个索引数组,其中包含连字符:
^{pr2}$第一个差异不是1的地方表示中断。再加一个假以保持长度。
现在取这些组的唯一元素,将
gaps
约束到相应的索引,并找到其最小/最大值您需要的是使用正则表达式来查找间隙(一个或多个破折号,其翻译为“-+”,加号表示一个或多个):
注释
matches
是匹配对象的列表enumerate
。你可以查一下它是怎么工作的。在.start()
,它返回开始索引,.end()
返回结束索引。注意,这里的结束索引比您想要的多了一个,因此我从中减去了1。在相关问题 更多 >
编程相关推荐