Speeding up reading binary files



The code below finds the files in a directory and opens them in binary mode before converting the contents to hex.

It works, but I would like to make it faster: it currently takes about 4 minutes to read 100k files, preferably without multithreading across multiple processors. Just wondering if anyone has any ideas.

import binascii
import os

def binary_file_reader(file_data):
    with open(file_data, 'rb') as binary_file_data:
        binary_file_data = binary_file_data.read()
        binary_data = binascii.hexlify(binary_file_data)
        binary_data = binary_data.decode("utf-8")
    return binary_data

if __name__ == "__main__":
    success_files_counted = 0
    unsuccess_files_counted = 0
    read_file_names = []
    device_directory = os.getcwd()

    for r, d, f in os.walk(device_directory):
        for file in f:
            file_data = os.path.join(r, file)
            try:
                binary_data = binary_file_reader(file_data)
                read_file_names.append("Successful: " + file_data)
                success_files_counted += 1
            except IOError:
                read_file_names.append("Unsuccessful: " + file_data)
                unsuccess_files_counted += 1

1 Answer

Python's concurrent.futures module allows two types of parallel processing (a minimal sketch of both follows the list below):

  • Multithreading (for I/O-bound tasks)
  • Multiprocessing (for CPU-bound tasks)
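
For illustration, here is a minimal sketch of the two executor types; `work` is a hypothetical stand-in for a real task, and both executors share the same `Executor.map` interface:

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def work(x):
    # placeholder task; substitute an I/O-bound or CPU-bound function
    return x * x

if __name__ == '__main__':
    # Threads: best when tasks spend most of their time waiting on I/O
    with ThreadPoolExecutor() as executor:
        print(list(executor.map(work, range(5))))

    # Processes: best when tasks are limited by CPU (sidesteps the GIL)
    with ProcessPoolExecutor() as executor:
        print(list(executor.map(work, range(5))))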

Results of benchmarking the task with 10K files:

  • The non-parallel and multithreaded versions take roughly the same time (plausibly because the hex conversion is CPU-bound, and CPython's GIL prevents threads from running it in parallel)
  • The multiprocessing version is roughly 2x faster

Code

Note: due to issues with multiprocessing in Jupyter notebooks on Windows, the multiprocessing code is placed in a separate file (under Windows' spawn start method, worker functions must be importable from a module, which code defined inside a notebook is not). This is not necessary in other environments.
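
To illustrate the constraint behind that note (my own aside, with `square` as a hypothetical stand-in): worker functions must be defined at module top level so child processes can pickle and import them.

from concurrent.futures import ProcessPoolExecutor

def square(x):
    # defined at module top level, so spawned child processes can import it
    return x * x

if __name__ == '__main__':
    with ProcessPoolExecutor() as executor:
        print(list(executor.map(square, [1, 2, 3])))   # works
        # executor.map(lambda x: x * x, [1, 2, 3])     # would fail: lambdas can't be pickled for child processes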

File: multi_process_hexify.py (all the processing code)

import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor
from time import time
import binascii

def all_files(directory):
    ' Generator yielding every file path under directory '
    for r, d, f in os.walk(directory):
        for name in f:
            yield os.path.join(r, name)

def create_test_files(folder_path, number_files, size):
    ' Create files with random binary data '
    # Create files folder (if doesn't exist)
    Path(folder_path).mkdir(parents=True, exist_ok=True) 

    # Create data in folder
    for i in range(number_files):
        data = os.urandom(size)
        with open(os.path.join(folder_path, f'{i}.txt'), 'wb') as f:
            f.write(data)
       
def binary_file_reader(file_path):
    with open(file_path, 'rb') as binary_file_data:  # 'rb' (read-only); 'r+b' would request write access too
        binary_file_data = binary_file_data.read()
        binary_data = binascii.hexlify(binary_file_data)
        binary_data = binary_data.decode("utf-8")
    return binary_data

def process_file(file_path):
    try:
        # The hex string itself is discarded; only success/failure is recorded
        binary_data = binary_file_reader(file_path)
        return f"Successful: {file_path}"
    except IOError:
        return f"Unsuccessful: {file_path}"
  
def get_final(responses):
    ' Materialise the responses and count successes and failures '
    responses = list(responses)
    successful = sum(1 for x in responses if x[0] == 'S')  # count "Successful: ..." entries
    unsuccessful = len(responses) - successful             # the rest are "Unsuccessful: ..."
    return responses, successful, unsuccessful

def main_non_parallel(device_directory):
    ' Unthreaded processing using process_file '
    start = time()
    responses = [process_file(file_path) for file_path in all_files(device_directory)]

    result = get_final(responses)
    end = time() - start
    
    print(f"Processed main_unthreaded in {end:.4f} sec")
    return result

def main_multithreaded(device_directory):
    ' Multithreaded processing using process_file '
    # submit vs map discussion:
    # https://stackoverflow.com/questions/42074501/python-concurrent-futures-processpoolexecutor-performance-of-submit-vs-map/42096963#42096963
    start = time()
    with ThreadPoolExecutor() as executor:
        # Note: chunksize is accepted but has no effect with ThreadPoolExecutor
        results = executor.map(process_file, all_files(device_directory), chunksize=1000)

    result = get_final(results)
    end = time() - start

    print(f"Processed main_multithreaded in {end:.4f} sec")
    return result

def main_multiprocessing(device_directory):
    ' Multiprocessing using process_file '
    start = time()
    files = list(all_files(device_directory))
    with ProcessPoolExecutor() as executor:
        # chunksize batches work items to cut down inter-process communication overhead
        results = executor.map(process_file, files, chunksize=1000)

    result = get_final(results)
    end = time() - start

    print(f"Processed main_multiprocessing in {end:.4f} sec")
    return result

Test

File: main.py

import os
import multi_process_hexify

if __name__ == '__main__':
    # Directory for files
    device_directory = os.path.join(os.getcwd(), 'test_dir')

    # Create data: 10,000 files of 100 random bytes each (the 10K-file benchmark above)
    multi_process_hexify.create_test_files(device_directory, 10_000, 100)

    # Perform non-parallel processing
    read_file_names_unthreaded, successful, unsuccessful = multi_process_hexify.main_non_parallel(device_directory)
    print(f'Successful {successful}, Unsuccessful {unsuccessful}')
    print()

    # Perform multithreaded processing
    read_file_names_threaded, successful, unsuccessful = multi_process_hexify.main_multithreaded(device_directory)
    print(f'Successful {successful}, Unsuccessful {unsuccessful}')
    print()

    # Perform multiprocessing
    read_file_names_multiprocessing, successful, unsuccessful = multi_process_hexify.main_multiprocessing(device_directory)
    print(f'Successful {successful}, Unsuccessful {unsuccessful}')

    # Confirm all three methods produce the same result
    print(read_file_names_unthreaded == read_file_names_threaded == read_file_names_multiprocessing)

Output

Processed main_unthreaded in 2.6610 sec
Successful 10000, Unsuccessful 0

Processed main_multithreaded in 3.2250 sec
Successful 10000, Unsuccessful 0

Processed main_multiprocessing in 1.2241 sec
Successful 10000, Unsuccessful 0
True
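
One further avenue, not benchmarked in the answer above: since Python 3.5, bytes.hex() performs the same conversion as binascii.hexlify(data).decode("utf-8") in a single call, avoiding one intermediate bytes object per file. A sketch of a drop-in variant, hex-dumping this script itself as a smoke test:

def binary_file_reader(file_path):
    # bytes.hex() yields the same lowercase hex string as
    # binascii.hexlify(...).decode("utf-8")
    with open(file_path, 'rb') as f:
        return f.read().hex()

if __name__ == '__main__':
    print(binary_file_reader(__file__)[:32])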
