在Pandas数据帧中包含jpg文件的最佳方法？

'Displacement', 'Engine Type', 'Width, Max w/o mirrors (in)', 'Height, Overall (in)', 'Length, Overall (in)', 'Gas Mileage', 'Drivetrain', 'Passenger Capacity', 'Passenger Doors', 'Body Style', 'unique identifier'

# One accumulator list per underscore-separated field of the file name.
make = []
model = []
year = []
msrp = []
front_wheel_size = []
sae_net_hp = []
displacement = []
engine_type = []
width = []
height = []
length = []
mpg = []
drivetrain = []
passenger_capacity = []
doors = []
body_style = []

for i in car_file:
    # Split each file name once instead of once per field.
    parts = i.split("_")
    make.append(parts[0])
    model.append(parts[1])
    year.append(parts[2])
    msrp.append(parts[3])
    front_wheel_size.append(parts[4])
    sae_net_hp.append(parts[5])
    displacement.append(parts[6])
    engine_type.append(parts[7])
    width.append(parts[8])
    height.append(parts[9])
    length.append(parts[10])
    mpg.append(parts[11])
    drivetrain.append(parts[12])
    passenger_capacity.append(parts[13])
    doors.append(parts[14])
    body_style.append(parts[15])

# Each list is one row of the intermediate frame; transposing turns the
# lists into columns so that every file name becomes one row.
df = pd.DataFrame([
    make, model, year, msrp, front_wheel_size, sae_net_hp, displacement,
    engine_type, width, height, length, mpg, drivetrain,
    passenger_capacity, doors, body_style,
]).T

1条回答

网友

1楼 · 发布于 2024-09-27 07:27:18

我不确定你是否真的想一次打开所有65000张图片，因为这可能会占用大量内存。我建议只将图像的路径保存在数据帧中

如果确实要打开它，请参见：How to read images into a script?

至于清理原始代码：不久前我做过类似的事情，当时是用正则表达式解决的，不过这可能有点小题大做。您也可以直接用split把每个文件名的值按行放入数据帧，而不是逐列构建。下面的示例展示了这两种思路（可能包含错误）：

from pathlib import Path
import re
import pandas as pd
import matplotlib.image as mpimg
from typing import Iterable, List


# Names of the underscore-separated fields of an image file name, in the
# order in which they appear in the name. The functions in this file use
# these names as regex group names and as DataFrame column labels.
FILEPARTS = [
    "make",
    "model",
    "year",
    "msrp",
    "front_wheel_size",
    "sae_net_hp",
    "displacement",
    "engine_type",
    "width",
    "height",
    "length",
    "mpg",
    "drivetrain",
    "passenger_capacity",
    "doors",
    "body_style",
    "id",
]


def via_regex(path_to_folder: str) -> pd.DataFrame:
    """Build a DataFrame from the .jpg files of a folder using a regex.

    Only ``*.jpg`` entries of the folder are considered, and of those only
    the ones accepted by ``filename_matcher`` end up in the result; all
    other files are skipped.

    Args:
        path_to_folder: Folder that contains the image files.

    Returns:
        A DataFrame with one row per matched file: one column per named
        regex group, a "File" column holding the file path and an "Image"
        column holding the result of ``mpimg.imread`` for that file.
    """
    root = Path(path_to_folder)

    # Restrict to .jpg files, then keep only names that fit the pattern.
    hits = filename_matcher(root.glob('*.jpg'))

    # One row per match; the named groups become the columns.
    frame = pd.DataFrame(hit.groupdict() for hit in hits)

    # ``hit.string`` is the file name that was matched, so joining it with
    # the folder gives back the path of the file.
    frame["File"] = [root / hit.string for hit in hits]

    # NOTE: every image is read into memory here.
    images = []
    for image_path in frame["File"].to_numpy():
        images.append(mpimg.imread(image_path))
    frame["Image"] = images
    return frame


def filename_matcher(files: Iterable) -> List:
    """Extract the FILEPARTS fields from file names with a regex.

    Each file name must consist of exactly ``len(FILEPARTS)`` non-empty
    parts separated by underscores, optionally followed by a file
    extension. The extension is matched outside the last group, so a name
    ending in ``..._123.jpg`` yields the id ``"123"`` and not ``"123.jpg"``.
    Being regex-based, the pattern could also be extended to separate
    numbers from units or similar.

    Args:
        files: Iterable of path-like objects that have a ``name`` attribute
            (e.g. ``pathlib.Path``).

    Returns:
        The match objects of all names that fit the pattern, in input
        order. ``match.string`` is the complete file name (extension
        included) and ``match.groupdict()`` maps each FILEPARTS name to its
        value. Names that do not fit the pattern are dropped.
    """
    # One named group per field, joined by the underscore separator. The
    # groups are lazy so that the last one leaves a trailing extension to
    # the optional suffix below instead of swallowing it.
    fields = "_".join(f"(?P<{name}>[^_]+?)" for name in FILEPARTS)
    pattern = re.compile(fields + r"(?:\.[^._]+)?")

    # fullmatch: the whole name has to fit, so names with additional
    # underscore-separated parts are rejected rather than truncated.
    matches = (pattern.fullmatch(f.name) for f in files)
    return [match for match in matches if match is not None]


def via_split(path_to_folder: str) -> pd.DataFrame:
    """Build a DataFrame from the .jpg files of a folder using ``str.split``.

    Assumes all .jpg files have the right naming, i.e. that the file stem
    splits on underscores into exactly ``len(FILEPARTS)`` parts.

    Args:
        path_to_folder: Folder that contains the image files.

    Returns:
        A DataFrame with one row per .jpg file: the FILEPARTS columns, a
        "File" column holding the file path and an "Image" column holding
        the result of ``mpimg.imread`` for that file.

    Raises:
        ValueError: If a file stem does not split into exactly
            ``len(FILEPARTS)`` parts.
    """
    folder = Path(path_to_folder)
    columns = FILEPARTS + ["File", "Image"]

    # Path.glob yields lazily and has no len(), so collect one record per
    # file and build the frame in a single step instead of pre-sizing it
    # and filling it cell by cell.
    records = []
    for f in folder.glob('*.jpg'):
        parts = f.stem.split('_')
        if len(parts) != len(FILEPARTS):
            raise ValueError(
                f"{f.name!r} has {len(parts)} underscore-separated parts, "
                f"expected {len(FILEPARTS)}"
            )
        record = dict(zip(FILEPARTS, parts))
        record["File"] = f
        # NOTE: every image is read into memory here.
        record["Image"] = mpimg.imread(f)
        records.append(record)

    # Passing the columns explicitly keeps their order and also gives an
    # empty folder a frame with the expected columns.
    return pd.DataFrame(records, columns=columns)


if __name__ == '__main__':
    # Demo run: build the table from the folder 'dir' with each of the two
    # approaches (regex-based and split-based).
    df_re = via_regex('dir')
    df_split = via_split('dir')

相关问题更多 >

编程相关推荐

热门问题

热门文章