Python script should run only in subdirectories, not in b


My python script loops over subdirectories with os.walk. I run the script from the current working directory and want it to work only in the current working directory and its subdirectories. However, the script goes back to the root directory and picks up all files from there. For example, the directory structure is:

folder1
  - sub1
folder2
  - sub2
  - sub3

If I run the script in folder1, I only want the files in folder1 and sub1, but my script also uses the files from folder2, sub2 and sub3. My script is long, but I hope someone can give me a short answer on how to change all the os.walk lines in the script so that Python does not go into the other folders.

#next step
print('Start merging contig files')

for root, dirs, files in os.walk(os.getcwd()):
    filepath = os.path.join(root, 'genes.faa.genespercontig.csv')
    if os.path.isfile(filepath):
        with open(filepath, 'r') as f1:
            df1 = pd.read_csv(f1, header=None, delim_whitespace=True, names = ["contig", "genes"])
            df1['genome'] = os.path.basename(os.path.dirname(filepath))
    else:
        continue

    filepath = os.path.join(root, 'hmmer.analyze.txt.results.txt')
    if os.path.isfile(filepath):
        with open(filepath, 'r') as f2:
            df2 = pd.read_csv(f2, header=None, delim_whitespace=True, names = ["contig", "SCM"])
            df2['genome'] = os.path.basename(os.path.dirname(filepath))
    else:
        continue

    filepath = os.path.join(root, 'genes.fna.output_blastplasmiddb.out.count_plasmiddbhit.out')
    if os.path.isfile(filepath):
        with open(filepath, 'r') as f3:
            df3 = pd.read_csv(f3, header=None, delim_whitespace=True, names = ["contig", "plasmid_genes"])
            df3['genome'] = os.path.basename(os.path.dirname(filepath))
    else:
        continue

    #merge dataframes
    dfmerge1 = pd.merge(df1, df2, on=['genome', 'contig'], how='outer')
    df_end = pd.merge(dfmerge1, df3, on=['genome', 'contig'], how='outer')

    #set NaN in columns to 0
    nan_cols = df_end.columns[df_end.isnull().any(axis=0)]
    for col in nan_cols:
        df_end[col] = df_end[col].fillna(0).astype(int)

    #add column with genes/SCM en round to 2 decimals
    df_end['SCM/genes'] = df_end['SCM']/df_end['genes']
    df_end['SCM/genes'] = df_end['SCM/genes'].round(2)

    #add column with genes/plasmid_genes en round to 2 decimals
    df_end['plasmid_genes/genes'] = df_end['plasmid_genes']/df_end['genes']
    df_end['plasmid_genes/genes'] = df_end['plasmid_genes/genes'].round(2)      

    df_end.to_csv(os.path.join(root,'outputgenesdf.csv'))
print('extra columns;done')

#next step
#CURRENT DIRECTORY 
cd = os.path.dirname(os.getcwd())

# concatenate csv files
dfList = []

for root, dirs, files in os.walk(cd):
    for fname in files:
        if re.match("outputgenesdf.csv", fname):
            frame = pd.read_csv(os.path.join(root, fname))
            dfList.append(frame)    

df = pd.concat(dfList)
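
For context, a minimal sketch of what the two starting points used in this script actually walk, assuming the folder1/folder2 layout above and that the script is started from folder1 (os.walk(top) only ever visits top and the directories below it):

import os

cwd = os.getcwd()              # .../folder1 when the script is started there
parent = os.path.dirname(cwd)  # the directory that contains folder1 and folder2

print([root for root, dirs, files in os.walk(cwd)])     # folder1 and sub1 only
print([root for root, dirs, files in os.walk(parent)])  # also folder2, sub2 and sub3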

1 Answer

Regarding:

[...] and want that the script only works in the current working directory and its subdirectories.[...]

You could try this:

def next_file(directory=os.getcwd(), max_depth=0, depth=0):

    if max_depth < 0 or depth <= max_depth:

        for name in os.listdir(directory):
            with_path = os.path.join(directory, name)

            if os.path.isfile(with_path):
                yield with_path
            else:
                for a_file in next_file(directory=with_path, max_depth=max_depth, depth=depth+1):
                    yield a_file

and process the files with:

for a_file in next_file(max_depth=1):
    print('processing file: %s' % a_file)
    # do your stuff here

Use max_depth to control how many levels of nested directories are processed: 0 reads only the files in the current directory, and -1 processes all directories (like os.walk does).
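
For comparison, the same kind of depth limit can be expressed with os.walk itself by pruning the dirs list in place. This is only an illustrative, untested sketch (walk_limited is a made-up name; the max_depth convention mirrors the one above, with a negative value meaning no limit):

import os

def walk_limited(top, max_depth=0):
    # Behaves like os.walk(top), but never descends more than max_depth
    # levels below top; a negative max_depth means no limit.
    top = os.path.abspath(top)
    for root, dirs, files in os.walk(top):
        yield root, dirs, files
        rel = os.path.relpath(root, top)
        depth = 0 if rel == '.' else rel.count(os.sep) + 1
        if 0 <= max_depth <= depth:
            dirs[:] = []  # emptying dirs in place stops os.walk from descending

# current working directory and its direct subdirectories only
for root, dirs, files in walk_limited(os.getcwd(), max_depth=1):
    print(root)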

Edit

I made a small change to the file iteration method.

Here is a complete (untested) version of your script:

def next_file(current_dir=os.getcwd(), max_depth=0, depth=0):

    if max_depth < 0 or depth <= max_depth:

        for name in os.listdir(current_dir):
            with_path = os.path.join(current_dir, name)

            if os.path.isfile(with_path):
                yield current_dir, name
            else:
                for directory, name in next_file(current_dir=with_path, max_depth=max_depth, depth=depth+1):
                    yield directory, name


for directory, name in next_file(max_depth=1):
    print('file: %s' % name)

print('Start merging contig files')

## for root, dirs, files in os.walk(os.getcwd()):
for directory, name in next_file(max_depth=1):

    ## filepath = os.path.join(root, 'genes.faa.genespercontig.csv')
    filepath = os.path.join(directory, name)

    ## if os.path.isfile(filepath):
    if name == 'genes.faa.genespercontig.csv':
        with open(filepath, 'r') as f1:
            df1 = pd.read_csv(f1, header=None, delim_whitespace=True, names = ["contig", "genes"])
            ## df1['genome'] = os.path.basename(os.path.dirname(filepath))
            df1['genome'] = filepath
    ## else:  # Not necessary
    ##     continue

    ## filepath = os.path.join(root, 'hmmer.analyze.txt.results.txt')
    ## if os.path.isfile(filepath):
    if name == 'hmmer.analyze.txt.results.txt':
        with open(filepath, 'r') as f2:
            df2 = pd.read_csv(f2, header=None, delim_whitespace=True, names = ["contig", "SCM"])
            ## df2['genome'] = os.path.basename(os.path.dirname(filepath))
            df2['genome'] = filepath
    ## else:
    ##     continue

    ## filepath = os.path.join(root, 'genes.fna.output_blastplasmiddb.out.count_plasmiddbhit.out')
    ## if os.path.isfile(filepath):
    if name == 'genes.fna.output_blastplasmiddb.out.count_plasmiddbhit.out':
        with open(filepath, 'r') as f3:
            df3 = pd.read_csv(f3, header=None, delim_whitespace=True, names = ["contig", "plasmid_genes"])
            ## df3['genome'] = os.path.basename(os.path.dirname(filepath))
            df3['genome'] = filepath
    ## else:
    ##     continue

#merge dataframes
dfmerge1 = pd.merge(df1, df2, on=['genome', 'contig'], how='outer')
df_end = pd.merge(dfmerge1, df3, on=['genome', 'contig'], how='outer')

#set NaN in columns to 0
nan_cols = df_end.columns[df_end.isnull().any(axis=0)]
for col in nan_cols:
    df_end[col] = df_end[col].fillna(0).astype(int)

#add column with genes/SCM en round to 2 decimals
df_end['SCM/genes'] = df_end['SCM']/df_end['genes']
df_end['SCM/genes'] = df_end['SCM/genes'].round(2)

#add column with genes/plasmid_genes en round to 2 decimals
df_end['plasmid_genes/genes'] = df_end['plasmid_genes']/df_end['genes']
df_end['plasmid_genes/genes'] = df_end['plasmid_genes/genes'].round(2)      

#CURRENT DIRECTORY 
cd = os.path.dirname(os.getcwd())
df_end.to_csv(os.path.join(cd,'outputgenesdf.csv'))
print('extra columns;done')

#next step
# concatenate csv files
dfList = []

## I'm not sure what you want to achieve with this:
for root, dirs, files in os.walk(cd):
    for fname in files:
        if re.match("outputgenesdf.csv", fname):
            frame = pd.read_csv(os.path.join(root, fname))
            dfList.append(frame)    

df = pd.concat(dfList)
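
If the intent of this last step is simply to collect every outputgenesdf.csv that lies below the current working directory (rather than below its parent), a recursive glob would be one possible alternative; an untested sketch, not part of the original answer:

import glob
import os
import pandas as pd

# '**' with recursive=True matches the starting directory and any depth below it
paths = glob.glob(os.path.join(os.getcwd(), '**', 'outputgenesdf.csv'), recursive=True)
df = pd.concat([pd.read_csv(p) for p in paths])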

Note that your file-handling logic is untested (I only tested the file iteration method). I'm not familiar with pandas, but in each dataframe you set a genome attribute, df1['genome'], which now holds the path of the file. I'm not sure that is what you want.
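
If the original behaviour is what was wanted (store the name of the folder that contains the file rather than its full path), the same value can be recovered from the (directory, name) pair; a small sketch with made-up values:

import os

directory = '/data/folder1/sub1'                      # hypothetical path
name = 'genes.faa.genespercontig.csv'
filepath = os.path.join(directory, name)

print(os.path.basename(os.path.dirname(filepath)))    # 'sub1' (what the original script stored)
print(os.path.basename(directory))                    # 'sub1' (same value, taken from directory)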

Finally, I don't understand how your last step is supposed to merge the outputgenesdf.csv files into a single dataframe.

Hope this helps.
