计算模式在字符串中连续出现的最大数量

def _longest_run(text, unit):
    """Return the largest number of back-to-back copies of ``unit`` in ``text``.

    Args:
        text: The string to scan.
        unit: The substring whose consecutive repeats are counted.

    Returns:
        The length, in copies of ``unit``, of the longest uninterrupted run;
        0 when ``unit`` is empty or never occurs.
    """
    if not unit:
        # An empty unit "matches" everywhere and would never terminate.
        return 0

    best = 0
    step = len(unit)
    start = text.find(unit)
    while start != -1:
        # Count how many copies follow each other directly from this start.
        copies = 1
        while text.startswith(unit, start + copies * step):
            copies += 1
        best = max(best, copies)
        # Advance by a single character so a run beginning at a different
        # offset is still considered.
        start = text.find(unit, start + 1)
    return best


# Read the whole sequence file as one string with line breaks removed.
# The context manager closes the file once it has been read.
with open(sequence, "r") as handle:
    seqfile = handle.read().replace("\n", "")

# The STRs (short repeated patterns) to look for, in result order.
patterns = (
    "AGATC",
    "TTTTTTCT",
    "AATG",
    "TCTAG",
    "GATA",
    "TATC",
    "GAAA",
    "TCTG",
)

# outercount[k] is the longest consecutive run of patterns[k] in the sequence.
# Each pattern is searched for at every position of the sequence; anchoring
# the match at the start of the string would only ever inspect its first
# characters.
outercount = [_longest_run(seqfile, unit) for unit in patterns]

2条回答

网友

1楼 · 编辑于 2024-09-26 22:49:50

以下是我的解决方案：

import re

# Pattern -> longest consecutive run found so far (in copies of the pattern).
patterns = {
    "AGATC": 0,
    "TTTTTTCT": 0,
    "AATG": 0,
    "TCTAG": 0,
    "GATA": 0,
    "TATC": 0,
    "GAAA": 0,
    "TCTG": 0,
}

# Read all lines up front; the file is closed as soon as the block exits.
with open(sequence, 'rt') as file:
    rows = file.readlines()

for row in rows:
    for pattern in patterns:
        regex = r"({0}(?:{0})+)".format(pattern)  # two or more back-to-back copies
        results = re.findall(regex, row)  # every run of at least two copies
        if results:
            # Pick the longest run by length; ordering the matches as strings
            # would compare them alphabetically instead.
            longest_sequence = max(results, key=len)
            count = len(longest_sequence) // len(pattern)  # copies in that run
        elif pattern in row:
            # The regex needs at least two copies, so an isolated occurrence
            # has to be counted separately.
            count = 1
        else:
            count = 0
        patterns[pattern] = max(count, patterns[pattern])

例如，对于模式 AGATC，生成的正则表达式是 (AGATC(?:AGATC)+)，含义是：先匹配一个 AGATC，其后再紧跟一个或多个（+）AGATC，也就是至少连续出现两次。(?:...) 是非捕获组（non-capturing group），因此 findall 只返回一个组，即整个匹配。

网友

2楼 · 编辑于 2024-09-26 22:49:50

假设一个模式可以在同一行中出现多次，我会按照以下步骤，计算每个模式在所有序列中、单个序列内的最大连续重复次数：

import re

# Read every line of the input file; each line is scanned on its own below.
with open(sequence_file, 'rt') as f:
    rows = f.readlines()

# Compiled pattern -> longest consecutive run found so far.
patterns = {
    re.compile("AGATC"): 0,
    re.compile("TCTAG"): 0,
    re.compile("TTTTTTCT"): 0,
    re.compile("AATG"): 0,
    re.compile("GATA"): 0,
    re.compile("TATC"): 0,
    re.compile("GAAA"): 0,
    re.compile("TCTG"): 0,
}

for r in rows:
    for p in patterns:
        prev_end = 0  # end offset of the previous match in this row
        freq = 0      # length of the run currently being extended
        for m in p.finditer(r):
            start, end = m.span()
            if start != prev_end:
                # This match does not begin where the previous one ended, so
                # the current run is over: record it and start a new one.
                patterns[p] = max(freq, patterns[p])
                freq = 0

            prev_end = end
            freq += 1

        # Record the run that was still open when the row ended.
        if freq:
            patterns[p] = max(freq, patterns[p])

注意：我还没有测试这段代码。所以，在使用之前，请使用已知的输入进行测试

相关问题更多 >

编程相关推荐

热门问题

热门文章