"对早晨明星熊猫使用的标签列表进行精炼"


So I have a list of NYSE ticker symbols that I scraped from the NASDAQ website:

import urllib.request
from bs4 import BeautifulSoup
import re

target = "NYSE"

def nyseTags():
    ret = []
    pattern = re.compile(r'\s+')  # used to strip all whitespace from the ticker cell
    for page in range(1, 64):
        source = "https://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=" + target + "&page=" + str(page)
        print("ReadingPage:" + source)
        filehandle = urllib.request.urlopen(source)
        soup = BeautifulSoup(filehandle.read(), "html.parser")
        rows = soup.findAll("tr")
        temp = []
        for n, row in enumerate(rows):
            if n == 0:  # skip the header row
                continue
            try:
                tds = row.findAll("td")
                temp.append(re.sub(pattern, '', tds[1].getText()))  # the ticker is in the second cell
                print(str(n) + " : " + temp[-1])
            except IndexError:  # rows without enough cells aren't data rows
                pass
        ret += temp
    return ret

def arrToCsv(arr, fn):
    with open(fn, "w") as fh:
        fh.write("\n".join(arr))

arrToCsv(nyseTags(), "nyseTags.lst")
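
For what it's worth, the same table can probably be pulled with pandas.read_html instead of walking the rows by hand. A minimal sketch, assuming the screener page still serves a plain HTML table whose header names the ticker column "Symbol":

import pandas as pd

def nyseTagsViaPandas():
    # Sketch only, not verified against the live page: read_html returns one
    # DataFrame per <table> element; "Symbol" as the column name is an
    # assumption about the screener table's header row.
    tags = []
    for page in range(1, 64):
        url = ("https://www.nasdaq.com/screening/companies-by-industry.aspx"
               "?exchange=NYSE&page=" + str(page))
        tables = pd.read_html(url)
        tags += tables[0]["Symbol"].astype(str).str.strip().tolist()
    return tags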

The original script gives me this initial list:

DDD
MMM
WBAI
...
XOXO
XPO
XYL
AUY
YELP
YEXT
YRD
YPF
YUMC
YUM
ZAYO
ZEN
ZBH
ZB^A
ZB^G
ZB^H
ZBK
ZOES
ZTS
ZTO
ZUO
ZYME

Now, I know that not all of these are usable tickers for my purposes, because I need to be able to access a daily window going back two years.

So I wrote the script below to filter out the tickers whose data I can't fetch from two years ago, but it hangs after what looks like a promising start.

I have no idea why this script can't make it through the whole list, so any input would be great!
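
(For completeness: the script calls web.DataReader with its defaults; the reader also exposes retry_count and pause parameters, so a more defensive call would look roughly like the sketch below, with placeholder values I haven't tuned.)

import datetime as dt
import pandas_datareader.data as web

# Hedged sketch: the same kind of call the script makes, but with explicit
# retry/pause arguments; "MMM" and the numbers are placeholders.
end = dt.datetime.now() - dt.timedelta(days=725)
start = dt.datetime.now() - dt.timedelta(days=731)
df = web.DataReader("MMM", 'morningstar', start, end, retry_count=3, pause=1.0)

Here is the full script: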

import datetime as dt
import pandas as pd
import pandas_datareader.data as web

now = dt.datetime.now()
windowEnd = now - dt.timedelta(days=725)    # roughly two years ago
windowStart = now - dt.timedelta(days=731)  # six days earlier: a small probe window

def fetch(tag, start, end):
    # start and end are dt.datetime objects; tag is a single ticker string or
    # a list of tickers
    if isinstance(tag, str):
        return web.DataReader(tag, 'morningstar', start, end)
    df = web.DataReader(tag, 'morningstar', start, end)
    # the multi-ticker result is indexed by (Symbol, Date); split it into a
    # dict with one DataFrame per ticker
    ret = {}
    for symbol, data in df.groupby(level=0):
        if isinstance(data, pd.DataFrame):
            ret[symbol] = data
    return ret
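# Usage, for example (hypothetical call sites): fetch("MMM", windowStart,
# windowEnd) returns a single DataFrame, while fetch(["MMM", "XPO"],
# windowStart, windowEnd) returns a dict keyed by ticker.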

def scrubTags(tagList):
    # Try the tickers in batches of 25; if a whole batch fails, retry its
    # members one at a time so a single bad ticker doesn't sink the rest.
    ret = []
    fails = []
    for i in range(0, len(tagList), 25):
        propTags = tagList[i:i + 25]
        try:
            fetch(propTags, windowStart, windowEnd)
            ret += propTags
            print(str(i // 25))
        except Exception:
            print("Error in:" + "\t".join(propTags))
            for k in propTags:
                try:
                    fetch(k, windowStart, windowEnd)
                    print("Passed:" + k)
                    ret.append(k)
                except Exception:
                    fails.append(k)
                    print("Failed:" + k)
    return (ret, fails)

def arrToCsv(arr, fn):
    with open(fn, "w") as fh:
        fh.write("\n".join(arr))

def loadNYSEtagList():
    # note: the filename must match what the first script wrote ("nyseTags.lst")
    with open("nyseTags.lst", "r") as fh:
        text = fh.read()
    return text.split("\n")

tagList = loadNYSEtagList()
scrubbed, failed = scrubTags(tagList)  # don't shadow the scrubTags function itself
arrToCsv(scrubbed, "Scrubbed_505_nyseTags.lst")
arrToCsv(failed, "Failed_505_nyseTags.lst")

另外->;我知道有一个充满数据帧的字典而不是使用多索引是很奇怪的,但我不认为这是我的问题的原因。你知道吗

