KeyError: "['jd'] not in index"

Posted 2024-09-30 10:39:49


Project: https://github.com/yoyolin/DataScienceJobs

Question: what is ['jd'], and how do I fix this error?

I am trying to run this author's Python code to reproduce the results on my own.

  1. I ran dataJobs.py. It ran successfully and produced dataJobs.csv.
  2. I ran webCrawl.py. It failed on the last two lines of code. It produced webcrawled.csv, but not dataJobs_v2_crawled.csv.

This is where I get the error:

Traceback (most recent call last):
  File "webCrawl.py", line 100, in <module>
    dataJobs[['jd']]= cleaned_list
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 2414, in __setitem__
    self._setitem_array(key, value)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 2442, in _setitem_array
    indexer = self.ix._convert_to_indexer(key, axis=1)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.py", line 1230, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
KeyError: "['jd'] not in index"

The error comes from line 100 of webCrawl.py:

dataJobs[['jd']]= cleaned_list

['jd'] only shows up once in the whole script, and I am not sure what the author intended with it. It also appears on two lines in dataJobs.py, but that should not matter for webCrawl.py, right?
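
For context, the failure can be reproduced outside the project. With a list of labels, dataJobs[['jd']] = ... goes through the _setitem_array / .ix path visible in the traceback, which only accepts columns that already exist, so it raises KeyError when the frame has no 'jd' column yet; assigning with a single label creates the column instead. Below is a minimal sketch with a throwaway DataFrame (my own example, not the author's data; the exact behaviour of the list form depends on the pandas version):

import pandas as pd

# stand-in for dataJobs: it has a "url" column but no "jd" column yet
df = pd.DataFrame({"url": ["http://example.com/a", "http://example.com/b"]})
cleaned_list = ["cleaned text one", "cleaned text two"]

# df[['jd']] = cleaned_list   # on the old pandas shown in the traceback this raises KeyError: "['jd'] not in index"
df['jd'] = cleaned_list       # a single label creates the new column
print(df)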

In webCrawl.py:

# -*- coding: utf-8 -*-
__author__ = 'Yiyou'

import sys
from bs4 import BeautifulSoup
import re
import pandas as pd
import urllib2
import nltk
reload(sys)
sys.setdefaultencoding('utf-8')

def webCrawl(url):
"""Given an indeed job url, return the whole text excluding script and style
Input:
    url: String
Output:
    content: String
"""
try:
    html = urllib2.urlopen(url).read() # Connect to the job posting
except:
    return ""


soup = BeautifulSoup(html, "html.parser")

# Reference for this step: https://jessesw.com/Data-Science-Skills/
for script in soup(["script", "style"]):
    script.extract() # Remove these two elements from the BS4 object to get clean text
content = soup.getText().lower()
return content

def extractUseful (content):
    if type(content) == float: #i
        return "notok"
    else:
        content = content.replace("\r"," ").replace("\n", " ")
        startwords = ["qualification", "responsibility", "require", "skill", "role", "experience", "demonstrate"]
        start = set([content.find(i) for i in startwords])
        if (-1 in start): #if doesn't find then it will be -1
            start.remove(-1)
        if (len(start) != 0): #if at least one of words is found
            start_pos = min(start)
            end_pos = content.find("days ago")-3 #end pos -3 is because we want to eliminate number if possible
            return  content[start_pos:end_pos]
        else:
            return "notok"

def process(text, filters=nltk.corpus.stopwords.words('english'), lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
""" Normalizes case and handles punctuation
Inputs:
    text: str: raw text
    lemmatizer: an instance of a class implementing the lemmatize() method
                (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
Outputs:
    list(str): tokenized text
"""
lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()
word_list = nltk.word_tokenize(text);

lemma_list = [];
for i in word_list:
    if i not in filters:
        try:
            lemma = lemmatizer.lemmatize(i);
            lemma_list.append(str(lemma));
        except:
            pass
return " ".join(lemma_list)


if __name__ == '__main__':
    #construct filter for processor
    file = open("accountant.txt").read().lower()
    filters = set(nltk.word_tokenize(file))
    filters.update(nltk.corpus.stopwords.words('english'))
    filters = list(filters)

    #webcrawling
    webContent = []
    dataJobs = pd.read_csv("dataJobs.csv");
    webContent = []
    for i in dataJobs["url"]:
        content = webCrawl(i);
        webContent.append(content);

    #clean the crawled text
    cleaned_list = []
    for j in webContent:
        cleaned = extractUseful(j);
        processed = process(cleaned, filters=nltk.corpus.stopwords.words('english'), lemmatizer=nltk.stem.wordnet.WordNetLemmatizer());
        cleaned_list.append(processed)

    #save to csv
    contents = pd.DataFrame({ "Content":webContent, "Cleaned": cleaned_list})
    contents.to_csv("webcrawled.csv")

    dataJobs[['jd']]= cleaned_list
    dataJobs.to_csv("dataJobs_v2_crawled.csv")
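
If it really is only those last two lines that fail, the workaround I would try is below. This is just my reading of what the author seems to intend (storing the cleaned job descriptions as a new 'jd' column), not a confirmed fix from the repository:

# possible replacement for the last two lines of webCrawl.py
# assumption: one cleaned description per row of dataJobs, which the loop above already produces
assert len(cleaned_list) == len(dataJobs)
dataJobs['jd'] = cleaned_list
dataJobs.to_csv("dataJobs_v2_crawled.csv")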

In dataJobs.py, after removing the Indeed API code:

__author__ = 'Yiyou'
import sys
import urllib2
import pandas as pd
import xml
import xml.etree.ElementTree as ET
import numpy as np
reload(sys)
sys.setdefaultencoding('utf-8')

def getTotalResults(query):
"""Obtain total number of jobs given a query
Inputs:
    string: query, seperated by +
Outputs:
    int: indicating no. of total jobs of the query
"""

#form url
query = "\"" + query + "\""   #double quotes mean it's querying exact title
url = "http://api.indeed.com/ads/apisearch?publisher=" + publisher_key + "&v=2&q="+query +"&l=&sort=&radius=&st=&jt=fulltime&start=0&limit=26&fromage=365&highlight=0&filter=&latlong=1&co=us&chnl=&userip=45.56.94.21&useragent=&v=2"
#url = 'http://api.indeed.com/ads/apisearch?publisher=8710117352111766&v=2&limit=100000&format=json
#read website
response = urllib2.urlopen(url)
content = response.read()

#parse XML
root = ET.fromstring(content)
num = int(root.find('totalresults').text)
return num

def indeedrequest(query, start):
"""form the url using query and startNo
Input:
    query: String, job title, using double quotes means exact wording in the title
    startNo : int, for mannually "turning page" as the indeed API has a restriction on number of jobs returned per query at 25
Output:
    content: String, the XML file read from constructed API url
"""
query = "\"" + query + "\""
url = "http://api.indeed.com/ads/apisearch?publisher=" + publisher_key + "&v=2&q="+query +"&l=&sort=&radius=&st=&jt=fulltime&start="+str(start)+"&limit=26&fromage=365&highlight=0&filter=&latlong=1&co=us&chnl=&userip=45.56.94.21&useragent=&v=2"
response = urllib2.urlopen(url)
content = response.read()
return(content)

def parseXMLtoDF(query, startNo):
"""parse xml file and then return a dataFrame of the 25 job results on the page
Input:
    query: String, job title, using double quotes means exact wording in the title
    startNo : int, for mannually "turning page" as the indeed API has a restriction on number of jobs returned per query at 25
Output:
    positionDB: a dataframe containing all job details from the XML page
"""

#Read and parse XML file
content = indeedrequest(query, startNo)
root = ET.fromstring(content)

#Iter through node result and store in dataframe
position_nodes = root.iter('result') #obtain all 25 XML formated Job files as an iterator
positionDB  = pd.DataFrame()

for position_node in position_nodes: #iterate through 25 XML formatted jobs
    position = position_node.getchildren()  #obtain all tags and its content for one particular job

    #construct a row in the dataframe
    row = dict()
    for jd in position: #iterate through all tags
        row[jd.tag] = jd.text

    #append the row into positionDB
    positionDB = positionDB.append(row, ignore_index=True)

return(positionDB)

def queryJobs(query):
"""Given a query, obtain all the job results as much as the API could return
Input:
    query: String, job title, using double quotes means exact wording in the title
Output:
    dataframe, containing all the job details and query
"""
total = min(1025,getTotalResults(query))  #as the API has a constrain at 1025 records to return at maximum
start = 0 # for mannually "turning page" as the indeed API has a restriction on number of jobs returned per query at 25

jobs = []
while(start <= total):
    jobs.append(parseXMLtoDF(query, start)) #append dataframe on each page to jobs
    start += 25 #"turn the page"
allDf =  pd.concat(jobs) #concate all the dataframe to one
allDf['query'] = query #record the query

return allDf

def queryAllJobs(queries):
"""Given a list of queries, obtain all the job results as much as the API could return
Input:
    queries: List of String, job title, using double quotes means exact wording in the title
Output:
    dataframe, containing all the job details and query
"""
dataJobs = []
for i in queries:
    dataJobs.append(queryJobs(i));
dataJobs = pd.concat(dataJobs)

#drop duplicated record from the dataframe, given unique jobkey
dataJobs = dataJobs.drop_duplicates(subset = "jobkey", keep = "first")
return dataJobs

if __name__ == '__main__':
    publisher_key = " "
    data = ["data+scientist", "data+engineer","data+analyst", "business+analyst","marketing+analyst", "machine+learning", "mechanical+engineer"]
    queryAllJobs(data).to_csv("dataJobs.csv")
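
As for the two lines in dataJobs.py that mention jd: there jd is only the loop variable over the XML children of each <result> node, so the columns written to dataJobs.csv are the Indeed tag names themselves, and no column literally named 'jd' is ever created. A quick way to check what webCrawl.py actually has to work with (the printed names depend on whatever the API returned; the examples in the comment are only my guesses):

import pandas as pd

cols = pd.read_csv("dataJobs.csv").columns.tolist()
print(cols)            # expect tag names such as 'jobtitle', 'company', 'url', 'jobkey' -- but no 'jd'
print('jd' in cols)    # presumably False, which is exactly what the KeyError complains about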

