python中的奇怪错误,需要帮助调试吗

2024-09-28 21:31:28 发布

您现在位置:Python中文网/ 问答频道 /正文

我有一个更大的脚本,它试图计算一些科学的东西。代码正在工作(开始时),但在一段时间后停止。错误不同。有时我得到Too many opened files错误,这是非常奇怪的,因为我几乎没有打开7-8个文件,我使用with open,所以它必须关闭。有时在7-8-9次迭代之后,脚本的内存就用完了,这是非常奇怪的,因为在上一次迭代中,它几乎不使用5gb,而服务器有500gb的内存。你知道吗

代码约为500行,如下所示:

import math
import multiprocessing
import os
import shutil
import subprocess
import sys
import time
import gc
import numpy as np
import pandas as pd
from scipy.optimize import minimize

from biopy import energy


class InnerCycle:
    def __init__(self, work_dir):
        self.CPU = multiprocessing.cpu_count() - 1
        self.ROSETTA_DIR = '/dlab/home/gerdos/bin/rosetta_src_2016.13.58602_bundle/main'
        self.INPUT_UNBOUND = "/dlab/data/analysis/STRUCTURES/LC8/5e0m_no_ligand.pdb"
        self.WORK_DIR = work_dir
        self.ENERGY_ATTRIBUTES = ['fa_atr', 'fa_rep', 'fa_sol', 'fa_elec', 'hbond_sr_bb', 'hbond_lr_bb', 'hbond_bb_sc', 'hbond_sc', 'rama', 'omega', 'fa_dun', 'p_aa_pp']
        self.PEPTIDE_KD = {
            # SPR meresek
            "RDGSTQTE": 1e-6,
            "EDKATQTL": 2e-6,
            "NTKMTQTP": 2e-6,
            "YNKEVQTV": 3e-6,
            "GDASTQTD": 7e-6,
            "QNKECQTE": 1.35e-6,
            # Chica
            "IDAATQTE": 4e-7,
            "SEVGTQTS": 6.3e-6,
            "ACAGTQTA": 1e-2,
            "RSTTTQTD": 5.2e-6,
            # PAK1
            "RDVATSPI": 4.2e-5,
            # Myosin 5
            # "DDKNTMTD": 8.8e-6
            # Tamas talalt
            "GDKSTQTD": 5.6e-7,
            "VDKSTQTD": 7e-8,
            "GSRGTQTE": 1.64e-6,
            "VSRGTQTE": 8e-8,
            "ICAGTQTD": 1.14e-6,
            "STTGTQCD": 12.8e-6,
            # Random peptides
            "EQKSNDAS": 1e-2,
            "GSAEIRAH": 1e-2,
            "RRQVKDNR": 1e-2,
            "IDTGITVS": 1e-2,
            "MELIWNLC": 1e-2,
            "CLDWMTMP": 1e-2,
            "LRQGLLPV": 1e-2,
            "NEAYPCPC": 1e-2,
            "NADNGTTV": 1e-2,
            "QALSQHVW": 1e-2,
            "VHTGCKDF": 1e-2,
            "PGVAPTLV": 1e-2,
            "PSIPISRQ": 1e-2,
            "PPGPEKVP": 1e-2,
            "KDFINFIS": 1e-2,
            "EILMQQFL": 1e-2,
            "ERAELQAG": 1e-2,
            "LGVASEDE": 1e-2,
            "SSVSTSNA": 1e-2,
            "GMDLDLAS": 1e-2,
            "VSMLSAIR": 1e-2,
            "GTYRCEAT": 1e-2,
            "PQNGGSSD": 1e-2,
            "RKAAVLIF": 1e-2,
            "QSTIPEHI": 1e-2,
            "LGLDKRQS": 1e-2,
            "FWDVVKRQ": 1e-2,
            "NHNQKCKE": 1e-2,
            "RCKFFSLT": 1e-2,
            "ASPRTVTI": 1e-2,
            # SPR 2. round
            "LDKQTQTP": 4e-6,
            "VDKETNTE": 8e-6,
            "VPKQTQTP": 5e-6,
            "YSKETQTP": 8e-6,
            "VDKAVQCS": 7e-6,
            "KDTGVQTD": 1e-5,
            "QHKECQTD": 9e-6,
            "RDQAVQTG": 9e-6
        }
        print("Creating the structures ...")
        # start = time.time()
        for _ in p.imap_unordered(self.main_func, self.PEPTIDE_KD.keys()):
            # print("Time elapsed on {} : {:.2f}min)".format(x, int(time.time() - start) / 60))
            pass
        print("Minimizing the domain...")
        proc = subprocess.Popen(
            '{}/source/bin/relax.linuxgccrelease -nstruct 1 -out:file:silent {}/domain_minimized.silent -out:file:silent_struct_type binary -s {} -score:weights {}/weights.wts'.format(
                self.ROSETTA_DIR, self.WORK_DIR, self.INPUT_UNBOUND, self.WORK_DIR), shell=True,
            stdout=subprocess.PIPE)
        out, err = proc.communicate()
        if err:
            sys.exit(err.decode())
        print("Structures are ready!")
        print('Reading energies to calculate the partition functions ...')
        # Reading the energies parallel
        self.COMPLEX_ENERGY = {}
        self.LIGAND_ENERGY = {}
        self.DOMAIN_ENERGY = energy('{}/domain_minimized.silent'.format(self.WORK_DIR))

        for x in p.imap_unordered(self.read_energies, self.PEPTIDE_KD.keys()):
            self.COMPLEX_ENERGY[x[0]] = x[1]
            self.LIGAND_ENERGY[x[0]] = x[2]
        #
        print('Finished reading energies ...')
        print('Minimizing the energy weights ...')
        bound = [(0, None) for _ in self.ENERGY_ATTRIBUTES]
        start_points = [1 for _ in self.ENERGY_ATTRIBUTES]
        # start_points = [0.049454984778022501, 1.0811964401313245e-16, 0.019417178401552562, 0.74966259917739431, -9.8627227576030013e-18, 0.7190273821671308, 0.81862557829921956,
        #                 0.82527639772869144, 0.48343139163199061, 0.73645662699706216, 0.4707545503593048, 1.705196654171699]

        res = minimize(self.minimizing_func, np.array(start_points), method='SLSQP', bounds=bound, options={'disp': True})
        print()
        print('Minimization done!')
        print('Final weights:')
        print("--------")
        print(list(res.x))
        print("--------")
        print('PEPTIDE\tPredicted Kd\tExperimantal Kd\tLogDif')
        self.minimizing_func([1 for _ in self.ENERGY_ATTRIBUTES], prt=True)
        print()
        print('Energy weights:')
        self.ener_dict = {}
        for num, j in enumerate(self.ENERGY_ATTRIBUTES):
            print(j, '\t{:.4f}'.format(res.x[num]))
            self.ener_dict[j] = res.x[num]

    def get_result(self):
        return self.ener_dict

    def mutate_ligand(self, seq):
        # tm = time.time()
        if os.path.isfile("input.pdb"):
            return
        # print("Mutating the ligand to {} ...".format(seq))
        with open("resfile", "w") as file_name:
            file_name.write("NATRO # default command that applies to everything without a non- default setting; do not repack\nSTART\n")
            for num, j in enumerate(seq):
                file_name.write("{} L PIKAA {} # Change 468 pos in A chain to Alanine\n".format(num + 468, j))
            proc = subprocess.Popen(
                '{}/source/bin/fixbb.linuxgccrelease -s /dlab/data/analysis/STRUCTURES/LC8/5e0m_prep.pdb -nstruct 1 -ex1 -ex2 -resfile resfile {}.pdb'.format(self.ROSETTA_DIR, seq,
                                                                                                                                                              seq),
                shell=True, stdout=subprocess.PIPE)
            _, err = proc.communicate()
        if err:
            sys.exit(err)
        os.rename("5e0m_prep_0001.pdb", "input.pdb")
        # print("Mutation of {} done in {:.2f} minutes".format(seq, (time.time() - tm) / 60))
        return

    def prepack(self, seq, overwrite=False):
        # tm = time.time()
        if os.path.isfile("prepacked.pdb") and not overwrite:
            # print("Preparing is done!\n")
            return
        # print("Prepacking the structure ...")
        proc = subprocess.Popen(
            '{1}/source/bin/FlexPepDocking.linuxgccrelease -database {1}/database -s {2} -flexpep_prepack -ex1 -ex2 -unboundrot {3} -score:weights weights.wts'.format(
                self.CPU, self.ROSETTA_DIR, "input.pdb", self.INPUT_UNBOUND), shell=True, stdout=subprocess.PIPE)
        _, err = proc.communicate()
        if err:
            sys.exit(err.decode())
        os.rename("input_0001.pdb", "prepacked.pdb")
        # print("Preparing of {} done in {:.2f} minutes!".format(seq, (time.time() - tm) / 60))
        return

    def dock(self, seq, overwrite=False):
        # tm = time.time()
        if os.path.isfile("docking_results.silent") and not overwrite:
            # print("Docking is done!\n")
            return
        # print("Docking the prepared structure (prepacked.pdb) ...")
        proc = subprocess.Popen(
            '{1}/source/bin/FlexPepDocking.linuxgccrelease -database {1}/database -s {2} -ex1 -ex2 -out:file:silent {3}.silent -out:file:silent_struct_type binary -pep_refine -nstruct 100 -unboundrot {4} -score:weights weights.wts'.format(
                self.CPU, self.ROSETTA_DIR, "prepacked.pdb", "docking_results", self.INPUT_UNBOUND),
            shell=True, stdout=subprocess.PIPE)
        _, err = proc.communicate()
        if err:
            sys.exit(err.decode())
        # print("Docking of {} done in {:.2f} minutes!".format(seq, (time.time() - tm) / 60))
        return

    # def random_ligand_generation(self):
    #     tm = time.time()
    #     if not os.path.isfile("input_L.pdb"):
    #         proc = subprocess.Popen(
    #             'python /dlab/home/gerdos/pywork/STRUCTURE/chain_splitter.py input.pdb L',
    #             shell=True, stdout=subprocess.PIPE)
    #         proc.communicate()
    #     if not os.path.isfile("ligand.silent"):
    #         proc = subprocess.Popen(
    #             '{}/source/bin/FlexPepDocking.linuxgccrelease -s input_L.pdb -pep_fold_only -torsionsMCM -nstruct 100 --out:file:silent ligand.silent'.format(self.ROSETTA_DIR),
    #             shell=True, stdout=subprocess.PIPE)
    #         proc.communicate()
    #     print("Random ligand generation with FlexPepDock is done in {:.2f} minutes!".format((time.time() - tm) / 60))

    def ligand_prep_fm(self, seq):
        tm = time.time()
        # Check if every minimization is done
        trg = False
        for num in range(100):
            if not os.path.isfile("{}/{}/RANDOM_LIGANDS/{}.silent".format(self.WORK_DIR, seq, str(num) + "a_8_minimized")):
                trg = True
        if not trg:
            return

        # Generating the random peptides
        if not os.path.isdir("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}".format(seq)):
            shutil.copytree("/dlab/home/gerdos/bin/flexiblemeccano/simulations/sample", "/dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}".format(seq))
            info_text = """Number_of_Amino_Acids                8
    Sequence_File                        /dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}/Input/sequence.txt
    Contact_Restriction                  no
    Contacts_File                        /dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}/Input/contacts.txt

    Number_of_Conformers                 100
    Output_Directory                     /dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}
    J_Calculation                        no
    PRE_Calculation                      no
    RDC_Calculation                      yes

    Phi/Psi_Database                     /dlab/home/gerdos/bin/flexiblemeccano/database/database_options/phi_psi.txt

    Print_PDB                            yes

    J_Coupling                           - - -

    Number_of_Cysteines                  -
    Cysteine_Positions                   -
    Dynamic                              -
    Proton                               -
    Proton_Frequency                     - -
    Intensity                            -
    Intrinsic_Linewidth_Proton           -

    Global_Tensor                        yes
    N_HN                                 yes
    CA_HA                                yes
    CA_CO                                yes
    N_CO                                 yes
    HN_CO                                yes""".format(seq)

            sequence_text = ""
            for num, j in enumerate(list(seq)):
                sequence_text += "{}     {}  0     0.00     0.00  0.00    0.00\n".format(num + 1, j)
            with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Input/info.txt".format(seq), "w") as file_name:
                file_name.write(info_text)
            with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Input/sequence.txt".format(seq), "w") as file_name:
                file_name.write(sequence_text)
            proc = subprocess.Popen(
                '/dlab/home/gerdos/bin/flexiblemeccano/applications/flexible_meccano ~/bin/flexiblemeccano/simulations/{}/ /usr/bin/gnuplot'.format(seq), shell=True,
                stdout=subprocess.PIPE)
            res, err = proc.communicate()
            if err:
                sys.exit(err.decode())
            # PDB files generated by FM have a mistake, correcting it
            for num in range(100):
                with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Output/PDB/{}a_8.pdb".format(seq, num), "r") as file_name:
                    pseu_str = file_name.readlines()
                for j, line in enumerate(pseu_str):
                    if line.startswith("ATOM"):
                        if int(line.split()[1]) >= 54:
                            line = list(line)
                            line.insert(11, " ")
                    pseu_str[j] = "".join(line)
                with open("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{}/Output/PDB/{}a_8.pdb".format(seq, num), "w") as file_name:
                    file_name.write("".join(pseu_str))
        if not os.path.isdir('{}/{}/RANDOM_LIGANDS'.format(self.WORK_DIR, seq)):
            shutil.copytree("/dlab/home/gerdos/bin/flexiblemeccano/simulations/{0}/Output/PDB".format(seq), "{}/{}/RANDOM_LIGANDS".format(self.WORK_DIR, seq))
        # Simulation
        for num in range(100):
            if not os.path.isfile("{}/{}/RANDOM_LIGANDS/{}.silent".format(self.WORK_DIR, seq, str(num) + "a_8_minimized")):
                proc = subprocess.Popen(
                    '{}/source/bin/relax.linuxgccrelease -nstruct 1 -out:file:silent {}/{}/RANDOM_LIGANDS/{}.silent -out:file:silent_struct_type binary -s {}/{}/RANDOM_LIGANDS/{} -score:weights weights.wts'.format(
                        self.ROSETTA_DIR, self.WORK_DIR, seq, str(num) + "a_8_minimized", self.WORK_DIR, seq, str(num) + "a_8.pdb"), shell=True, stdout=subprocess.PIPE)
                out, err = proc.communicate()
                if err:
                    sys.exit(err.decode())
        # print("Random ligand generation with FlexibleMeccano is done in {:.2f} minutes!".format((time.time() - tm) / 60))
        return

    def complex_minimization(self):
        # extract the docked PDBs
        if not os.path.isdir('COMPLEX_STRUCTURES'):
            os.mkdir('COMPLEX_STRUCTURES')
        os.chdir('COMPLEX_STRUCTURES')
        shutil.copy('../weights.wts', '.')
        # Check id PDBs have been extracted before
        tgl = False
        for num in range(100):
            if not os.path.isfile('prepacked_{:04d}.pdb'.format(num + 1)):
                tgl = True
        if tgl:
            proc = subprocess.Popen(
                '{}/source/bin/extract_pdbs.linuxgccrelease -in:file:silent ../docking_results.silent'.format(
                    self.ROSETTA_DIR), shell=True, stdout=subprocess.PIPE)
            proc.communicate()
        # Minimizing the complexes
        for num in range(100):
            if not os.path.isfile("{}.silent".format('prepacked_{:04d}_minimized'.format(num + 1))):
                proc = subprocess.Popen(
                    '{}/source/bin/relax.linuxgccrelease -nstruct 1 -out:file:silent {}.silent -out:file:silent_struct_type binary -s {} -score:weights weights.wts'.format(
                        self.ROSETTA_DIR, 'prepacked_{:04d}_minimized'.format(num + 1), 'prepacked_{:04d}.pdb'.format(num + 1)), shell=True, stdout=subprocess.PIPE)
                out, err = proc.communicate()
                if err:
                    sys.exit(err.decode())

    def read_energies(self, motif):
        # tm = time.time()
        os.chdir("{}/{}".format(self.WORK_DIR, motif))
        complx_ener = pd.DataFrame()
        for num in range(100):
            complx_ener = complx_ener.append(energy('COMPLEX_STRUCTURES/prepacked_{:04d}_minimized.silent'.format(num + 1))[self.ENERGY_ATTRIBUTES])
        lig_ener = pd.DataFrame()
        for num in range(100):
            lig_ener = lig_ener.append(energy('RANDOM_LIGANDS/{}.silent'.format(str(num) + "a_8_minimized"))[self.ENERGY_ATTRIBUTES])
        # print("Read energies for {} in {:.2f} minutes!".format(motif, (time.time() - tm) / 60))
        return motif, complx_ener, lig_ener

    @staticmethod
    def sigmoid(x):
        # return x
        # return math.log(x, math.e)
        return -math.tanh(0.15 * (math.log(x, math.e) - math.log(1e-4, math.e)))

    def minimizing_func(self, weights_lst, prt=False):
        # if not prt:
        #     print(weights_lst)
        res = 0
        # Energy DataFarmes should not me modified globally
        domain_ener = self.DOMAIN_ENERGY.copy()

        # Domain partial function calculation, independent of peptide
        domain_partial_function = np.float128()
        for num, j in enumerate(self.ENERGY_ATTRIBUTES):
            domain_ener.loc[:, j] *= weights_lst[num]
        for num, j in domain_ener.iterrows():
            domain_partial_function += math.e ** -np.sum([j[ener_type] for ener_type in self.ENERGY_ATTRIBUTES])
        # Ligand and complex partial functions are depending on the peptide
        for inner_key, kd_experimental in self.PEPTIDE_KD.items():
            # Copy the pd object, as it is immutable
            complex_ener = self.COMPLEX_ENERGY[inner_key].copy(deep=True)
            ligand_ener = self.LIGAND_ENERGY[inner_key].copy(deep=True)
            # Weighting the energies
            for num, j in enumerate(self.ENERGY_ATTRIBUTES):
                complex_ener.loc[:, j] *= weights_lst[num]
                ligand_ener.loc[:, j] *= weights_lst[num]
            # Complex partial function
            complex_partial_function = np.float128()
            for num, j in complex_ener.iterrows():
                complex_partial_function += math.e ** -np.sum([j[ener_type] for ener_type in self.ENERGY_ATTRIBUTES])
            # Ligand partial function
            ligand_partial_function = np.float128()
            for num, j in ligand_ener.iterrows():
                ligand_partial_function += math.e ** -np.sum([j[ener_type] for ener_type in self.ENERGY_ATTRIBUTES])
            try:
                res += (self.sigmoid(ligand_partial_function * domain_partial_function / complex_partial_function) - self.sigmoid(kd_experimental)) ** 2
            except ValueError:
                return np.NaN
            if prt:
                print('{}\t{:.4e}\t{:.4e}\t{:.4e}'.format(inner_key, ligand_partial_function * domain_partial_function / complex_partial_function, kd_experimental,
                                                          abs(math.log(ligand_partial_function * domain_partial_function / complex_partial_function, 10) - math.log(kd_experimental,
                                                                                                                                                                    10))))

        print('\rDifference: {:.4f}'.format(res),end='')
        return res

    def main_func(self, seq):
        # Creating the directory isf not exists, and go in there
        if not os.path.isdir("{}/{}".format(self.WORK_DIR, seq)):
            os.mkdir("{}/{}".format(self.WORK_DIR, seq))
        os.chdir("{}/{}".format(self.WORK_DIR, seq))
        shutil.copy('{}/weights.wts'.format(self.WORK_DIR), '.')
        self.mutate_ligand(seq)
        self.prepack(seq)
        self.dock(seq)

        # Ligand generation with FlexPepDock
        # random_ligand_generation()

        # Ligand generation with FlexibleMeccano
        self.ligand_prep_fm(seq)
        self.complex_minimization()
        return seq

        # def plotData(var, x, y, color1, u, v, color2):
        #     pearR = np.corrcoef(x, y)[1, 0]
        #     # least squares from:
        #     # http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html
        #     A = np.vstack([x, np.ones(len(x))]).T
        #     m, c = np.linalg.lstsq(A, np.array(y))[0]
        #     plt.figure(1)
        #     plt.subplot(211)
        #     # plt.xlim([min(x) * 1.1, max(x) * 0.9])
        #     plt.ylim([-8, -2])
        #     plt.scatter(x, y, label='Regi', color=color1)
        #     plt.legend(loc=2)
        #     plt.xlabel("log10(Kd)")
        #     plt.ylabel(var)
        #     plt.subplot(212)
        #     plt.scatter(u, v, label='Uj', color=color2)
        #     # plt.plot(x, x * m + c, color=color,
        #     #          label="R = %6.2e" % (pearR))
        #     plt.xlabel("log10(Kd)")
        #     plt.ylabel(var)
        #     # plt.xlim([min(x) * 1.1, max(x) * 0.9])
        #     plt.ylim([-8, -2])
        #     plt.legend(loc=2)
        #     plt.show()


INITIAL_WEIGHTS = {
    'fa_atr': 1,
    'fa_rep': 0.55,
    'fa_sol': 0.9375,
    'fa_intra_rep': 0.005,
    'fa_elec': 0.875,
    'pro_close': 0,
    'hbond_sr_bb': 1.17,
    'hbond_lr_bb': 1.17,
    'hbond_bb_sc': 1.17,
    'hbond_sc': 1.1,
    'dslf_fa13': 0,
    'rama': 0.25,
    'omega': 0.625,
    'fa_dun': 0.7,
    'p_aa_pp': 0.4,
    'yhh_planarity': 0,
    'ref': 0
}
new_energies = INITIAL_WEIGHTS
p = multiprocessing.Pool(processes=multiprocessing.cpu_count()-1)
print('Found {} cores to use'.format(multiprocessing.cpu_count() - 1))


START_POINT = 7
if os.path.isfile(('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(START_POINT - 1))):
    with open('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(START_POINT - 1)) as fh:
        for line in fh:
            new_energies[line.split()[0]] = float(line.split()[1].strip())

for i in range(START_POINT, 50):

    try:
        os.mkdir('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}'.format(i))
    except FileExistsError:
        pass
    print('Running the inner cycle with weights:')
    for key, val in sorted(new_energies.items()):
        print('{}\t{}'.format(key, val))
    print('----------------')
    if not os.path.isfile('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(i)):
        with open('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}/weights.wts'.format(i), 'w') as fn:
            for key, val in new_energies.items():
                fn.write("{} {}".format(key, val))
                fn.write('\n')
    returned_ener = InnerCycle('/dlab/data/analysis/STRUCTURES/LC8/ITERATION/ROUND_{}'.format(i)).get_result()
    for key, val in returned_ener.items():
        new_energies[key] = (1 / (i + 1) * returned_ener[key]) + (new_energies[key] * (1 - (1 / (i + 1))))
    print('----------------')
    print('Cycle {} complete'.format(i))
    print('----------------')
    gc.collect()

我知道它很长,而且大多数人没有时间调试我的失败。我是一个生物学家,我还没有很好的编程技能。你知道吗

我不明白为什么我的记忆会耗尽。如何调试这样的错误?你知道吗

简而言之,代码运行方式如下:

  1. 进行模拟(使用multiprocessing
  2. 加权结果 模拟(数据收集是并行进行的 有multiprocessing
  3. scipy最小化权重
  4. 使用给定的最小结果运行模拟,然后从 第二个。步骤

谢谢!你知道吗


Tags: inselfformatforifbintimeos