针对大量小文件的C与Python I/O性能比较

#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#define LINE_SIZE 300
#define BUFFER_SZ (5000 * LINE_SIZE)

/*
 * Append the NUL-terminated string `src` to `buf`.
 *
 * `*len` is the current string length held in `buf` and `cap` is the total
 * capacity of `buf` in bytes.  Tracking the length avoids rescanning the
 * whole buffer on every append, which is what repeated strcat() would do.
 *
 * Returns 0 on success, or -1 (leaving `buf` untouched) if `src` does not fit.
 */
static int append_line(char *buf, size_t *len, size_t cap, const char *src)
{
    size_t n = strlen(src);

    if (n + 1 > cap - *len)
        return -1;
    memcpy(buf + *len, src, n + 1);
    *len += n;
    return 0;
}

/*
 * Combine the two-line files found in ./runs into the single file `fname`.
 *
 * Every directory entry whose name contains both "Hs" and ".txt" is opened.
 * Its first line (the header) is kept only for the first file processed; its
 * second line is always kept.  Everything is gathered in one in-memory buffer
 * and written to `fname` (relative to the directory the process started in)
 * in a single write.
 *
 * Files that cannot be opened, or that hold fewer than two lines, are
 * reported on stderr and skipped.  Nothing is written if ./runs cannot be
 * entered or listed.
 */
void combine(char *fname)
{
    DIR *d;
    FILE *fp;
    struct dirent *dir;
    char header[LINE_SIZE];
    char values[LINE_SIZE];
    char *buffer;
    size_t used = 0;
    int need_header = 1;

    /* 1.5 MB is too large to place on the stack safely, so use the heap. */
    buffer = malloc(BUFFER_SZ);
    if (buffer == NULL) {
        perror("malloc");
        return;
    }
    buffer[0] = '\0';

    if (chdir("runs") != 0) {
        perror("runs");
        free(buffer);
        return;
    }

    d = opendir(".");
    if (d == NULL) {
        perror("opendir");
        if (chdir("..") != 0)
            perror("chdir");
        free(buffer);
        return;
    }

    while ((dir = readdir(d)) != NULL) {
        int ok;

        if (!strstr(dir->d_name, "Hs") || !strstr(dir->d_name, ".txt"))
            continue;

        fp = fopen(dir->d_name, "r");
        if (fp == NULL) {
            perror(dir->d_name);
            continue;
        }
        /* Read both lines before touching the buffer so that a short file
         * never contributes a stale or partial record. */
        ok = fgets(header, LINE_SIZE, fp) != NULL
          && fgets(values, LINE_SIZE, fp) != NULL;
        fclose(fp);
        if (!ok) {
            fprintf(stderr, "Error reading file %s. Skipping.\n", dir->d_name);
            continue;
        }

        if ((need_header && append_line(buffer, &used, BUFFER_SZ, header) != 0)
            || append_line(buffer, &used, BUFFER_SZ, values) != 0) {
            fprintf(stderr, "output buffer full; remaining files ignored\n");
            break;
        }
        need_header = 0;
    }
    closedir(d);

    if (chdir("..") != 0)
        perror("chdir");

    fp = fopen(fname, "w");
    if (fp == NULL) {
        perror(fname);
        free(buffer);
        return;
    }
    /* Write the data verbatim.  It must never be used as a printf format
     * string, because the input files may contain '%' characters. */
    if (fwrite(buffer, 1, used, fp) != used)
        perror(fname);
    fclose(fp);
    free(buffer);
}

int main(void)
{
    clock_t tc;
    int msec;

    /* NOTE: clock() is specified to report processor time; some C runtimes
     * (e.g. Microsoft's) report elapsed wall-clock time instead. */
    tc = clock();
    combine("results_c.txt");
    msec = (int)((clock() - tc) * 1000 / CLOCKS_PER_SEC);
    /* %03d keeps the leading zeros of the millisecond part (5 ms -> 0.005). */
    printf("elapsed time: %d.%03ds\n", msec / 1000, msec % 1000);
    return 0;
}

import glob
import os
from time import time


def combine(wildcard, fname='results.txt'):
    """Concatenate all files matching a name pattern into one file.

    Each input file is expected to hold exactly two lines, the first one
    being the header.  The header of the first valid file is written once,
    followed by the second line of every valid file.

    Args:
        wildcard: Glob pattern selecting the input files.
        fname: Path of the output file; it is overwritten if it exists.

    Files that do not contain exactly two lines are reported on stdout and
    skipped.
    """
    parts = []
    for path in glob.glob(wildcard):
        with open(path, 'r') as pf:
            lines = pf.readlines()
        if len(lines) != 2:
            print('Error reading file %s. Skipping.' % path)
            continue
        # `parts` stays empty until the first valid file, so this emits the
        # header exactly once.
        if not parts:
            parts.append(lines[0])
        parts.append(lines[1])

    # Collecting the pieces and writing them in one go avoids building the
    # output through repeated string concatenation.
    with open(fname, 'w') as pf:
        pf.writelines(parts)


if __name__ == '__main__':
    et = time()
    # os.path.join keeps the pattern valid on both Windows and POSIX.
    combine(os.path.join('runs', 'Hs*.txt'))
    et = time() - et
    print("elapsed time: %.3fs" % et)

Run 1/10 C elapsed time: 9.530s Python elapsed time: 10.225s =================== Run 2/10 C elapsed time: 5.378s Python elapsed time: 10.613s =================== Run 3/10 C elapsed time: 6.534s Python elapsed time: 13.971s =================== Run 4/10 C elapsed time: 5.927s Python elapsed time: 14.181s =================== Run 5/10 C elapsed time: 5.981s Python elapsed time: 9.662s =================== Run 6/10 C elapsed time: 4.658s Python elapsed time: 9.757s =================== Run 7/10 C elapsed time: 10.323s Python elapsed time: 19.032s =================== Run 8/10 C elapsed time: 8.236s Python elapsed time: 18.800s =================== Run 9/10 C elapsed time: 7.580s Python elapsed time: 15.730s =================== Run 10/10 C elapsed time: 9.465s Python elapsed time: 20.532s ===================

In [2]: prun bc.combine('runs\\Hs*.txt') 64850 function calls (64847 primitive calls) in 12.205 seconds Ordered by: internal time ncalls tottime percall cumtime percall filename:lineno(function) 1899 8.391 0.004 8.417 0.004 {built-in method io.open} 1898 3.322 0.002 3.341 0.002 {method 'readlines' of '_io._IOBase' objects} 1 0.255 0.255 0.255 0.255 {built-in method nt.listdir}

1条回答

网友

1楼 · 发布于 2024-09-29 23:28:29

可以归结为以下几点：

最重要的是，Python版本使用文本模式（即r和w），这意味着处理的是str（UTF-8）对象，而不是bytes。
文件数量很多而且都很小，我们对每个文件所做的处理又很少，因此Python自身的开销（例如在open中创建file对象）就变得很显著。
Python必须为大多数事情动态分配内存。

另外请注意，如果使用本地文件并多次运行，那么此测试中的I/O基本上无关紧要，因为这些文件已经被缓存在内存中。唯一真正的I/O是最后的write（即便如此，您也必须确保数据确实被刷新/同步到磁盘）。

现在，如果以二进制模式处理文件（即使用rb和wb），并且减少内存分配（在本例中不那么重要，但效果也能看得出来），您会得到如下结果：

def combine(wildcard='runs/Hs*.txt', fname='results-python-new.txt'):
    """Concatenate two-line files into one file, working purely on bytes.

    The files are opened in binary mode, so no text decoding or encoding
    takes place.  The first line (header) of the first file is written once,
    followed by the second line of every file.

    Args:
        wildcard: Glob pattern selecting the input files.
        fname: Path of the output file; it is overwritten if it exists.

    A file with fewer than two lines contributes empty bytes for the lines it
    lacks, because ``readline`` returns ``b''`` at end of file.
    """
    # Expand the pattern before creating the output file so that the output
    # can never be picked up as one of its own inputs.
    filenames = glob.glob(wildcard)
    header_written = False
    with open(fname, 'wb') as fout:
        for filename in filenames:
            with open(filename, 'rb') as fin:
                header = fin.readline()
                values = fin.readline()
            if not header_written:
                fout.write(header)
                header_written = True
            fout.write(values)

这样一来，Python完成任务所需的时间就已经只有C版本的一半：

（此处原有的计时结果在页面转载时丢失，仅剩占位符。）

您可能还可以再稍微缩短一点时间，例如避免使用glob。

但是，如果对C版本也做一些类似的修改，那么C版本的耗时会不到Python版本的三分之一：

New C:      0.068

看看：

#define LINE_SIZE 300

/*
 * Combine the two-line files found in ./runs into "results-c-new.txt".
 *
 * The output file is created in the directory the process starts in.  Every
 * entry of ./runs whose name contains both "Hs" and ".txt" is opened in
 * binary mode.  Its first line (the header) is written only for the first
 * file processed; its second line is always written.  Lines are streamed
 * straight to the output, so no intermediate buffer is needed.
 *
 * Files that cannot be opened are reported on stderr and skipped; files with
 * fewer than two lines are skipped silently.  The working directory is
 * restored before returning.
 */
void combine(void) {
    DIR *d;
    FILE *fin;
    FILE *fout;
    struct dirent *dir;
    char headers[LINE_SIZE];
    char values[LINE_SIZE];
    short flagHeader = 1;
    int ok;

    fout = fopen("results-c-new.txt", "wb");
    if (fout == NULL) {
        perror("results-c-new.txt");
        return;
    }
    /* Bail out rather than scan whatever directory we happen to be in. */
    if (chdir("runs") != 0) {
        perror("runs");
        fclose(fout);
        return;
    }

    d = opendir(".");
    if (d) {
        while ((dir = readdir(d)) != NULL) {
            if (!strstr(dir->d_name, "Hs") || !strstr(dir->d_name, ".txt"))
                continue;

            fin = fopen(dir->d_name, "rb");
            if (fin == NULL) {
                perror(dir->d_name);
                continue;
            }
            /* Read both lines before writing anything so that a short file
             * never emits stale or uninitialized buffer contents. */
            ok = fgets(headers, LINE_SIZE, fin) != NULL
              && fgets(values, LINE_SIZE, fin) != NULL;
            fclose(fin);
            if (!ok)
                continue;

            if (flagHeader) {
                flagHeader = 0;
                fputs(headers, fout);
            }
            fputs(values, fout);
        }
        closedir(d);
    } else {
        perror("opendir");
    }

    if (chdir("..") != 0)
        perror("chdir");
    /* Close the output on every path; fclose also reports deferred write
     * errors from the buffered stream. */
    if (fclose(fout) != 0)
        perror("results-c-new.txt");
}

相关问题更多 >

编程相关推荐

热门问题

热门文章