在Python中,我尝试迭代6000个URL并下载它们
当我尝试使用少量迭代(4个URL)时,一切都按预期进行
with open("SECmasterURLs.txt",'r') as f:
byte_data = f.read()
count = 0
masterurls = byte_data.splitlines()
createFolder(r"/Users/egecikrikci/Desktop/SEC Scrape/ParsedDatas")
createFolder(r"/Users/egecikrikci/Desktop/SEC Scrape/MasterDatas")
ParsedFolder = (r"/Users/egecikrikci/Desktop/SEC Scrape/ParsedDatas/")
MasterFolder = (r"/Users/egecikrikci/Desktop/SEC Scrape/MasterDatas/")
for line in masterurls:
DataDownloader(line, ParsedFolder, MasterFolder)
process = psutil.Process(os.getpid())
__memoryusage__ = (process.memory_info().rss) # in bytes
print (__memoryusage__ / 1000000)
作为输出,它会按预期创建2个文件,并从SECmasterURLs.txt中列出的URL下载我的4个文件
但当我尝试使用6000 URL时,它会返回一个错误:
UnboundLocalError Traceback (most recent call last)
<ipython-input-32-cc04452d2aa1> in <module>
11 for line in xx:
12
---> 13 DataDownloader(line, ParsedFolder, MasterFolder)
14 process = psutil.Process(os.getpid())
15 __memoryusage__ = (process.memory_info().rss) # in bytes
<ipython-input-27-1ffb4717a449> in DataDownloader(file_url, folderforparsed, folderformaster)
25
26 # define a new dataset with out the header info.
---> 27 data_format = data[start_ind + 1:]
28
29 master_data = []
UnboundLocalError: local variable 'start_ind' referenced before assignment
下面是DataDownloader中的代码:
def DataDownloader(file_url, folderforparsed, folderformaster):
urlsplit = file_url.split('/')
urlsplit2 = urlsplit[8].split('.')
filenamebuilder = '{}{}'.format(urlsplit2[0],urlsplit2[1] + '.txt')
MasterFiles = open(folderforparsed + 'parsed' + filenamebuilder, 'w')
content = requests.get(file_url).content
count = 0
with open(folderformaster + filenamebuilder, 'wb') as f:
f.write(content)
# let's open it and we will now have a byte stream to play with.
with open(folderformaster + filenamebuilder,'rb') as f:
byte_data = f.read()
# Now that we loaded the data, we have a byte stream that needs to be decoded and then split by -------.
data = byte_data.decode("utf-8").split('----')
# We need to remove the headers, so look for the end of the header and grab it's index
for index, item in enumerate(data):
if "ftp://ftp.sec.gov/edgar/" in item:
start_ind = index
# define a new dataset with out the header info.
data_format = data[start_ind + 1:]
master_data = []
# now we need to break the data into sections, this way we can move to the final step of getting each row value.
for index, item in enumerate(data_format):
# if it's the first index, it won't be even so treat it differently
if index == 0:
clean_item_data = item.replace('\n','|').split('|')
clean_item_data = clean_item_data[8:]
else:
clean_item_data = item.replace('\n','|').split('|')
for index, row in enumerate(clean_item_data):
# when you find the text file.
if '.txt' in row:
# grab the values that belong to that row. It's 4 values before and one after.
mini_list = clean_item_data[(index - 4): index + 1]
if len(mini_list) != 0:
mini_list[4] = "https://www.sec.gov/Archives/" + mini_list[4]
master_data.append(mini_list)
#loop through each document in the master list.
for index, document in enumerate(master_data):
# create a dictionary for each document in the master list
document_dict = {}
document_dict['cik_number'] = document[0]
document_dict['company_name'] = document[1]
document_dict['form_id'] = document[2]
document_dict['date'] = document[3]
document_dict['file_url'] = document[4]
master_data[index] = document_dict
for document_dict in master_data:
# if it's a 10-K document pull the url and the name.
if document_dict['form_id'] == '10-K':
# get the components
comp_name = document_dict['company_name']
docu_url = document_dict['file_url']
form_type = document_dict['form_id']
print('-'*100)
print(comp_name)
print(docu_url)
print('Form Type is: {}'.format(form_type))
MasterFiles.write('-'*75)
MasterFiles.write('\n')
MasterFiles.write(comp_name)
MasterFiles.write('\n')
MasterFiles.write(docu_url)
MasterFiles.write('\n')
MasterFiles.write(form_type)
MasterFiles.write('\n')
count = count + 1
发生的事情是,当代码到达这一行时
因为启动标识尚未初始化,所以它会爆炸
该变量通过以下行初始化:
因此,如果数据不包含该字符串,则永远不会初始化start\u ind。对于正在处理的6000个URL中的某些子集,该字符串不能存在
相关问题 更多 >
编程相关推荐