读取含特定字符的特定数据时，str.count() 大幅变慢

import os
import random as rd
import string
import time


# Function to create random data in a specific pattern with separator ";":
def createRandomString(num,io,fullLength):
    """Build one random record made of ``num`` ';'-terminated fields.

    The first field is always ``Start``. Every later field is either an empty
    quoted value (``""``) or a 7-letter random string that is shared by the
    whole record; a non-empty field may additionally be followed by embedded
    line breaks, so one record usually spans several physical lines.

    Because the random string is drawn from ``string.ascii_letters``, each
    field contributes exactly one ';' and the record contains ``num`` of them.

    ``io`` and ``fullLength`` are accepted but not used.

    Returns the record as a single string ending in a newline.
    """
    lineFull = ''
    nl = True  # the first non-empty field always gets a line break
    randstr = ''.join(rd.choice(string.ascii_letters) for _ in range(7))
    for i in range(num):
        if i == 0:
            line = 'Start;'
        else:
            line = ''
            bb = rd.choice([True,True,False])
            if bb:
                # Two times out of three the field is an empty quoted value.
                line = line+'\"\";'
            else:
                if rd.random() < 0.999:
                    line = line+randstr
                else:
                    # Rarely, emit a much longer value.
                    line = line+rd.randint(10,100)*randstr
                if nl and i != num-1:
                    line = line+';\n'
                    nl = False
                elif rd.random() < 0.04 and i != num-1:
                    line = line+';\n'
                    if rd.random() < 0.01:
                        # Occasionally add a run of blank lines as well.
                        add = rd.randint(1,10)*'\n'
                        line = line+add
                else:
                    line = line+';'
        lineFull = lineFull+line
    return lineFull+'\n'


# Create file with random data:
outputFolder = "C:\\DataDir\\Output\\"
numberOfCols = 38
fullLength = 10000
testLines = [createRandomString(numberOfCols,i,fullLength) for i in range(fullLength)]
with open(outputFolder+"TestFile.txt",'w') as tf:
    tf.writelines(testLines)

# Read in file:
with open(outputFolder+"TestFile.txt",'r') as ff:
    lines = []
    for line in ff.readlines():
        # unicode() is the Python 2 builtin; this script targets Python 2.
        lines.append(unicode(line.rstrip('\n')))

# Restore columns by counting the separator:
linesT = ''
lines2 = []
time0 = time.time()
for i in range(len(lines)):
    linesT = linesT + lines[i]
    # The whole accumulated buffer is rescanned on every iteration.
    count = linesT.count(';')
    # The buffer is flushed only when the count matches exactly.
    if count == numberOfCols:
        lines2.append(linesT)
        linesT = ''
    # Report the elapsed time for each batch of 1000 input lines.
    if i%1000 == 0:
        print(time.time()-time0)
        time0 = time.time()
print(time.time()-time0)

0.0 0.0019998550415 0.00100016593933 0.000999927520752 0.000999927520752 0.000999927520752 0.000999927520752 0.00100016593933 0.0019998550415 0.000999927520752 0.00100016593933 0.0019998550415 0.00100016593933 0.000999927520752 0.00200009346008 0.000999927520752 0.000999927520752 0.00200009346008 0.000999927520752 0.000999927520752 0.00200009346008 0.000999927520752 0.00100016593933 0.000999927520752 0.00200009346008 0.000999927520752

0.0 0.0759999752045 0.273000001907 0.519999980927 0.716000080109 0.919999837875 1.11500000954 1.25199985504 1.51200008392 1.72199988365 1.8820002079 2.07999992371 2.21499991417 2.37400007248 2.64800000191 2.81900000572 3.04500007629 3.20299983025 3.55500006676 3.6930000782 3.79499983788 4.13900017738 4.19899988174 4.58700013161 4.81799983978 4.92000007629 5.2009999752 5.40199995041 5.48399996758 5.70299983025 5.92300009727 6.01099991798 6.44200015068 6.58999991417 3.99399995804

# Characters that occur in the real data set, one list entry per character.
charSet = list(
    ' "&\'()*+,-./'
    '0123456789'
    ':;<=>'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '\\_`'
    'abdeghilmnorstx'
)

import random as rd
import string

rd.seed()


# NOTE: every TestN builds its 10000-character random string inside the
# function, so timing a call measures string generation plus the count.

def Test0():
    """Count '7' in a fresh 10000-character string of random digits."""
    randstr = ''.join(rd.choice(string.digits) for _ in range(10000))
    randstr.count('7')


def Test1():
    """Count 'a' in a fresh 10000-character string of random ASCII letters."""
    randstr = ''.join(rd.choice(string.ascii_letters) for _ in range(10000))
    randstr.count('a')


def Test2():
    """Count ';' in a fresh 10000-character string of random printable characters."""
    randstr = ''.join(rd.choice(string.printable) for _ in range(10000))
    randstr.count(';')


def Test3():
    """Count ';' in a fresh 10000-character string drawn from ``charSet``."""
    randstr = ''.join(rd.choice(charSet) for _ in range(10000))
    randstr.count(';')

%timeit(Test0()) 100 loops, best of 3: 9.27 ms per loop %timeit(Test1()) 100 loops, best of 3: 9.12 ms per loop %timeit(Test2()) 100 loops, best of 3: 9.94 ms per loop %timeit(Test3()) 100 loops, best of 3: 8.31 ms per loop

2条回答

网友

1楼 · 编辑于 2024-09-28 17:08:14

我只是报告我的发现。性能差异似乎不是来自str.count()函数。我更改了您的代码并将str.count()重构为自己的函数。我还将您的全局代码放入一个main函数中。以下是我的代码版本：

import os
import time
import random as rd
import string
import timeit

# Function to create random data in a specific pattern with separator ";":
def createRandomString(num,io,fullLength):
    """Build one random record made of ``num`` ';'-terminated fields.

    The first field is always ``Start``. Every later field is either an empty
    quoted value (``""``) or a 7-letter random string shared by the whole
    record; a non-empty field may be followed by embedded line breaks, so a
    record usually spans several physical lines.

    ``io`` and ``fullLength`` are accepted but not used.

    Returns the record as a single string ending in a newline.
    """
    token = ''.join(rd.choice(string.ascii_letters) for _ in range(7))
    pieces = []
    first_break_pending = True  # the first non-empty field always breaks the line
    last_index = num - 1

    for idx in range(num):
        if idx == 0:
            pieces.append('Start;')
            continue

        # Two times out of three the field is an empty quoted value.
        if rd.choice([True, True, False]):
            pieces.append('\"\";')
            continue

        # Rarely, the value is repeated 10-100 times to form a long field.
        repeat = 1 if rd.random() < 0.999 else rd.randint(10, 100)
        field = repeat * token

        not_last = idx != last_index
        if first_break_pending and not_last:
            field += ';\n'
            first_break_pending = False
        elif rd.random() < 0.04 and not_last:
            field += ';\n'
            if rd.random() < 0.01:
                # Occasionally add a run of blank lines as well.
                field += rd.randint(1, 10) * '\n'
        else:
            field += ';'
        pieces.append(field)

    pieces.append('\n')
    return ''.join(pieces)


def counting_func(lines_iter):
    """Return the number of ';' in the next item of ``lines_iter``.

    ``lines_iter`` is an iterator over strings. Each call consumes one item.
    Returns -1 once the iterator is exhausted.
    """
    # The builtin next() works on both Python 2.6+ and Python 3, whereas the
    # ``.next()`` method exists only on Python 2 iterators.
    try:
        line = next(lines_iter)
    except StopIteration:
        return -1
    return line.count(';')


def wrapper(func, *args, **kwargs):
    """Bind ``func`` to the given arguments and return a zero-argument callable.

    Calling the returned function invokes ``func(*args, **kwargs)`` and
    returns its result.
    """
    def wrapped():
        result = func(*args, **kwargs)
        return result
    return wrapped


# Create file with random data:
def main():
    """Write a random test file, read it back, and time ';' counting per line.

    Creates ``TestFile.txt`` in ``outputFolder`` (the current directory when
    it is empty), then prints the total time ``timeit`` reports for
    ``fullLength`` calls of ``counting_func`` over the file's lines.
    """
    numberOfCols = 38
    fullLength = 100000
    outputFolder = ""
    dataPath = outputFolder+"TestFile.txt"

    # Generate every record first, then write them out in a single call.
    records = [createRandomString(numberOfCols, idx, fullLength) for idx in range(fullLength)]
    with open(dataPath, 'w') as sink:
        sink.writelines(records)

    # Read in file, dropping the trailing newline of each physical line.
    # unicode() is the Python 2 builtin.
    with open(dataPath, 'r') as source:
        lines = [unicode(raw.rstrip('\n')) for raw in source.readlines()]

    # Each timed call counts the separators in the next line of the file.
    timed_call = wrapper(counting_func, iter(lines))
    print(timeit.timeit(timed_call, number=fullLength))


if __name__ == '__main__':
    main()

测试共执行100000次，每次处理生成的一行。使用string.ascii_letters时，timeit给出的结果是每个循环平均0.0454177856445秒；使用string.printable时，平均为0.0426299571991秒。实际上后者比前者还略快一些，不过差别不大。

我怀疑性能的差异来自于除了计数之外，您在以下循环中所做的工作：

# Reassemble records: keep appending physical lines to linesT until the
# accumulated text holds exactly numberOfCols separators.
for i in range(len(lines)):
    linesT = linesT + lines[i]
    # Rescans the whole accumulated buffer on every iteration, so the cost of
    # this call grows with len(linesT).
    count = linesT.count(';')
    # The buffer is flushed only on an exact match; once count exceeds
    # numberOfCols this branch is never taken again and linesT keeps growing.
    if count == numberOfCols:
        lines2.append(linesT)
        linesT = ''
    # Report the elapsed time for each batch of 1000 input lines.
    if i%1000 == 0:
        print time.time()-time0
        time0 = time.time()

另一种可能是，在没有main函数的情况下访问全局变量会更慢。但这在两种情况下都会发生，所以这并不是真正的原因。

网友

2楼 · 编辑于 2024-09-28 17:08:14

问题出在count(';')。

string.printable包含';'，而string.ascii_letters不包含。

然后随着linesT长度的增长，执行时间也随之增长：

0.000236988067627
0.0460968017578
0.145275115967
0.271568059921
0.435608148575
0.575787067413
0.750104904175
0.899538993835
1.08505797386
1.24447107315
1.34459710121
1.45430088043
1.63317894936
1.90502595901
1.92841100693
2.07722711563
2.16924905777
2.30753016472

尤其是这段代码在string.printable方面有问题：

 numberOfCols = 38
 if count == numberOfCols:
        lines2.append(linesT)
        linesT = ''

因为在linesT被清空之前，下一行可能包含多个';'（例如计数已到37时），这样38就会被跳过，linesT将无限增长。

您可以保留最初的string.ascii_letters设置，并将代码改为count('a')来观察这种行为。

要解决printable的问题，您可以如下修改代码：

if count > numberOfCols:

然后我们回到预期的运行时行为：

0.000234842300415
0.00233697891235
0.00247097015381
0.00217199325562
0.00262403488159
0.00262403488159
0.0023078918457
0.0024049282074
0.00231409072876
0.00233006477356
0.00214791297913
0.0028760433197
0.00241804122925
0.00250506401062
0.00254893302917
0.00266218185425
0.00236296653748
0.00201988220215
0.00245118141174
0.00206398963928
0.00219988822937
0.00230193138123
0.00205302238464
0.00230097770691
0.00248003005981
0.00204801559448

相关问题更多 >

编程相关推荐

热门问题

热门文章