“UnboundLocalError:赋值前引用的局部变量“？”在大量迭代中出错

with open("SECmasterURLs.txt",'r') as f: byte_data = f.read() count = 0 masterurls = byte_data.splitlines() createFolder(r"/Users/egecikrikci/Desktop/SEC Scrape/ParsedDatas") createFolder(r"/Users/egecikrikci/Desktop/SEC Scrape/MasterDatas") ParsedFolder = (r"/Users/egecikrikci/Desktop/SEC Scrape/ParsedDatas/") MasterFolder = (r"/Users/egecikrikci/Desktop/SEC Scrape/MasterDatas/") for line in masterurls: DataDownloader(line, ParsedFolder, MasterFolder) process = psutil.Process(os.getpid()) __memoryusage__ = (process.memory_info().rss) # in bytes print (__memoryusage__ / 1000000)

UnboundLocalError Traceback (most recent call last) <ipython-input-32-cc04452d2aa1> in <module> 11 for line in xx: 12 ---> 13 DataDownloader(line, ParsedFolder, MasterFolder) 14 process = psutil.Process(os.getpid()) 15 __memoryusage__ = (process.memory_info().rss) # in bytes <ipython-input-27-1ffb4717a449> in DataDownloader(file_url, folderforparsed, folderformaster) 25 26 # define a new dataset with out the header info. ---> 27 data_format = data[start_ind + 1:] 28 29 master_data = [] UnboundLocalError: local variable 'start_ind' referenced before assignment

def DataDownloader(file_url, folderforparsed, folderformaster): urlsplit = file_url.split('/') urlsplit2 = urlsplit[8].split('.') filenamebuilder = '{}{}'.format(urlsplit2[0],urlsplit2[1] + '.txt') MasterFiles = open(folderforparsed + 'parsed' + filenamebuilder, 'w') content = requests.get(file_url).content count = 0 with open(folderformaster + filenamebuilder, 'wb') as f: f.write(content) # let's open it and we will now have a byte stream to play with. with open(folderformaster + filenamebuilder,'rb') as f: byte_data = f.read() # Now that we loaded the data, we have a byte stream that needs to be decoded and then split by -------. data = byte_data.decode("utf-8").split('----') # We need to remove the headers, so look for the end of the header and grab it's index for index, item in enumerate(data): if "ftp://ftp.sec.gov/edgar/" in item: start_ind = index # define a new dataset with out the header info. data_format = data[start_ind + 1:] master_data = [] # now we need to break the data into sections, this way we can move to the final step of getting each row value. for index, item in enumerate(data_format): # if it's the first index, it won't be even so treat it differently if index == 0: clean_item_data = item.replace('\n','|').split('|') clean_item_data = clean_item_data[8:] else: clean_item_data = item.replace('\n','|').split('|') for index, row in enumerate(clean_item_data): # when you find the text file. if '.txt' in row: # grab the values that belong to that row. It's 4 values before and one after. mini_list = clean_item_data[(index - 4): index + 1] if len(mini_list) != 0: mini_list[4] = "https://www.sec.gov/Archives/" + mini_list[4] master_data.append(mini_list) #loop through each document in the master list. for index, document in enumerate(master_data): # create a dictionary for each document in the master list document_dict = {} document_dict['cik_number'] = document[0] document_dict['company_name'] = document[1] document_dict['form_id'] = document[2] document_dict['date'] = document[3] document_dict['file_url'] = document[4] master_data[index] = document_dict for document_dict in master_data: # if it's a 10-K document pull the url and the name. if document_dict['form_id'] == '10-K': # get the components comp_name = document_dict['company_name'] docu_url = document_dict['file_url'] form_type = document_dict['form_id'] print('-'*100) print(comp_name) print(docu_url) print('Form Type is: {}'.format(form_type)) MasterFiles.write('-'*75) MasterFiles.write('\n') MasterFiles.write(comp_name) MasterFiles.write('\n') MasterFiles.write(docu_url) MasterFiles.write('\n') MasterFiles.write(form_type) MasterFiles.write('\n') count = count + 1

1条回答

网友

1楼 · 发布于 2024-09-28 01:34:07

发生的事情是，当代码到达这一行时

data_format = data[start_ind + 1:]

因为启动标识尚未初始化，所以它会爆炸

该变量通过以下行初始化：

for index, item in enumerate(data):
        if "ftp://ftp.sec.gov/edgar/" in item:
            start_ind = index

因此，如果数据不包含该字符串，则永远不会初始化start\u ind。对于正在处理的6000个URL中的某些子集，该字符串不能存在

相关问题更多 >

编程相关推荐

热门问题

热门文章