import random
# Rejection-sample 8 start positions in [0, n - k] such that the intervals
# (start, start + k) built from them never touch or overlap each other.
result, n, k = set(), 20000, 10
for _ in range(8):
    a = random.randint(0, n - k)
    # The distance test is symmetric on purpose: a candidate collides with an
    # accepted start i both when it lands inside [i, i + k] and when it starts
    # up to k positions *before* i, because its own interval then runs into i.
    while any(abs(a - i) <= k for i in result):
        a = random.randint(0, n - k)
    result.add(a)
# Expand every accepted start into its (start, end) pair.
final_result = [(i, i + k) for i in result]
import numpy as np
from typing import List
LoS = List[str]


def string2substrings(s: str, m: int, k: int) -> LoS:
    """Cut ``m`` random, non-overlapping substrings of length ``k`` out of ``s``.

    The ``len(s) - m*k`` characters that do not belong to any piece are spread
    over ``m + 1`` gaps: one before the first piece, one between each pair of
    consecutive pieces and one after the last piece.  The gap sizes are drawn
    from an equal-probability multinomial distribution, whose components
    always add up to the requested total, so the pieces fit inside ``s`` by
    construction and no retry loop is needed.  A gap may be empty, in which
    case two pieces are directly adjacent.

    Args:
        s: Source string; must not be empty.
        m: Number of substrings to extract; must be positive.
        k: Length of every substring; must be positive.

    Returns:
        A list of ``m`` substrings of length ``k`` each, in the order in
        which they occur in ``s``.

    Raises:
        RuntimeError: If ``s`` is empty, ``k`` or ``m`` is not positive, or
            ``s`` is shorter than ``m * k`` characters.
    """
    length = len(s)
    if length == 0:
        raise RuntimeError("Empty input string")
    if k <= 0:
        raise RuntimeError(f"Bad substrung length {k}")
    if m <= 0:
        raise RuntimeError(f"Bad substrung count {m}")
    if length < m * k:
        raise RuntimeError(
            f"cannot split string {s} into {m} pieces with length {k}"
            f" - source length {length} is too short"
        )

    # Characters left over once all m pieces are accounted for.
    slack = length - m * k
    if slack == 0:
        # No room for any gap: the pieces are laid back to back, so there is
        # nothing random to draw.
        return [s[t * k:(t + 1) * k] for t in range(m)]

    # Equal weight for each of the m + 1 gaps.
    pvals = np.full(m + 1, 1.0 / (m + 1), dtype=np.float64)
    gaps = np.random.multinomial(n=slack, pvals=pvals)

    pieces = []
    end = 0
    # Only the first m gaps precede a piece; the last one is trailing padding.
    for gap in gaps[:m]:
        start = end + int(gap)
        end = start + k
        pieces.append(s[start:end])
    return pieces
# Demo: pull three 3-character pieces out of a 12-character sample string
# and print the resulting list.
print(string2substrings(s="qwertyuiopas", m=3, k=3))
可以使用 `while` 循环来强制唯一性:
输出:
您可以简单地运行一个循环,在循环内部使用 `random` 包来选择一个起始索引,并提取从该索引开始的子字符串。跟踪已使用的起始索引,以便检查每个新的子字符串是否与已有的子字符串不重叠。只要 `k` 不是太大,这种方法就应该快速而简单。
我之所以提到 `k` 的大小,是因为如果它足够大,就可能出现无法选出 8 个不重叠子字符串的情况。但只有当 `k` 相对于原始字符串的长度相当大时,才需要考虑这一点。
嗯,没有必要做循环。如果我们想把字符串
`s` 分割成 `m` 个子串,每个子串的长度为 `k`,这个问题就等价于对子串之间的 `m+1` 个间隔进行采样,并要求这些间隔的总长度等于 `len(s) - m*k`。
有一个很好的离散分布,其性质正是采样值之和等于固定量:多项分布(Multinomial Distribution)。得到间隔之后,按间隔拆分字符串就非常简单了,代码可能还可以进一步优化。
代码:Python 3.7.5,Anaconda x64,Win10。
相关问题 更多 >
编程相关推荐