Wikipedia Infobox解析器，提供多语言支持

class WikipediaInfobox(object):
    """Fetch a Wikipedia page's infobox and parse it into key/value pairs.

    The parsed values are *unprocessed*: they still contain brackets and
    other wikitext markup.

    Attributes:
        infoboxArrayUnprocessed: list of single-entry ``{key: value}`` dicts,
            in the order the fields appear in the infobox.
        infoboxDictUnprocessed: dict mapping field name to raw value, plus a
            ``"wiki_type"`` entry holding the text that follows
            ``{{Infobox`` on the template's first line.
        language: Wikipedia subdomain used to build the API URL.
    """

    # Class-level defaults are kept so attribute access on the class itself
    # keeps working; __init__ rebinds the mutable ones per instance so that
    # separate objects no longer append into one shared list.
    infoboxArrayUnprocessed = []  # Maintains the order in which the data is displayed.
    infoboxDictUnprocessed = {}  # Still contains brackets and wikitext coding.
    language = "en"

    def __init__(self, pageTitle="", followRedirect=False):
        """Optionally fetch and parse the infobox of ``pageTitle``.

        Args:
            pageTitle: title of the page to load; an empty string creates an
                empty object and performs no network access.
            followRedirect: passed through to ``getInfoboxRaw``.
        """
        self.infoboxArrayUnprocessed = []
        self.infoboxDictUnprocessed = {}
        if pageTitle != "":
            self.language = guess_language.guessLanguage(pageTitle)
            if self.language == "UNKNOWN":
                self.language = "en"
            infoboxRaw = self.getInfoboxRaw(pageTitle, followRedirect)
            # The parsed data ends up in self.infoboxDictUnprocessed.
            self.getInfoboxDict(infoboxRaw)

    def getInfoboxDict(self, infoboxRaw):
        """Parse raw infobox text into a dict (values left unprocessed).

        The first line becomes the ``"wiki_type"`` entry. Every following
        line of the form ``| key = value`` becomes one entry; it is also
        appended to ``self.infoboxArrayUnprocessed`` to preserve order.

        Args:
            infoboxRaw: text between ``{{Infobox`` and its closing braces.

        Returns:
            The parsed dict, or ``{}`` when ``infoboxRaw`` is blank (in which
            case the instance attributes are left untouched).
        """
        if infoboxRaw.strip() == "":
            return {}

        # NOTE(review): as written this replace() is a no-op; the first
        # argument may originally have been a different whitespace
        # character that was lost -- confirm against the upstream source.
        boxLines = [line.strip().replace(" ", " ")
                    for line in infoboxRaw.splitlines()]

        toReturn = {"wiki_type": boxLines[0]}
        for line in boxLines[1:]:
            # Drop the leading parameter separator only when it is there,
            # rather than blindly removing the first character.
            if line.startswith("|"):
                line = line[1:]
            # partition() splits on the FIRST "=" only, so values that
            # themselves contain "=" (URLs, template arguments) stay intact.
            key, separator, value = line.partition("=")
            if not separator:
                # Blank lines and continuation lines carry no "key=value"
                # pair; they previously raised IndexError.
                continue
            key = key.strip()
            value = value.strip()
            self.infoboxArrayUnprocessed.append({key: value})
            toReturn[key] = value

        self.infoboxDictUnprocessed = toReturn
        return toReturn

    def getInfoboxRaw(self, pageTitle, followRedirect=False, resetOld=True):
        """Download a page's wikitext and return the raw infobox body.

        Args:
            pageTitle: title of the page to query.
            followRedirect: when true, a page containing ``#REDIRECT`` is
                resolved by querying the ``[[target]]`` it names; otherwise
                such a page yields ``""``.
            resetOld: when true, clear the data stored on this instance
                before fetching.

        Returns:
            The text between ``{{Infobox`` and its matching ``}}``, or ``""``
            when the page has no revisions, is an unfollowed redirect, or has
            no ``{{Infobox`` template.
        """
        if resetOld:
            # These must be instance attributes; assigning plain local names
            # here (as before) reset nothing.
            self.infoboxDict = {}
            self.infoboxDictUnprocessed = {}
            self.infoboxArray = []
            self.infoboxArrayUnprocessed = []

        # Local imports so the method works on Python 2 and Python 3 without
        # depending on which urllib flavour the module imported.
        try:
            from urllib import quote, urlopen  # Python 2
        except ImportError:
            from urllib.parse import quote  # Python 3
            from urllib.request import urlopen
        from contextlib import closing

        params = {
            "format": "xml",
            "action": "query",
            "prop": "revisions",
            "rvprop": "timestamp|user|comment|content",
        }
        params["titles"] = "%s" % quote(pageTitle.encode("utf8"))
        qs = "&".join("%s=%s" % (k, v) for k, v in params.items())
        url = "http://" + self.language + ".wikipedia.org/w/api.php?%s" % qs

        # closing() releases the connection even if parsing raises.
        with closing(urlopen(url)) as response:
            tree = etree.parse(response)

        revs = tree.xpath('//rev')
        if len(revs) == 0:
            return ""

        # An empty <rev> element has text None; treat it as empty wikitext.
        text = revs[-1].text or ""

        if "#REDIRECT" in text:
            if not followRedirect:
                return ""
            redirectPage = text[text.find("[[") + 2:text.find("]]")]
            return self.getInfoboxRaw(redirectPage, followRedirect, resetOld)

        # -> No multi-language support: only the English template name.
        return self._extractTemplateBody(text, "{{Infobox")

    @staticmethod
    def _extractTemplateBody(text, marker="{{Infobox"):
        """Return the text after ``marker`` up to its matching ``}}``.

        Nested ``{{...}}`` templates inside the body are skipped over by
        counting brace pairs, so the body is not cut at the first inner
        ``}}``. Returns ``""`` when ``marker`` is absent; when the template
        is never closed, everything after ``marker`` is returned.
        """
        markerAt = text.find(marker)
        if markerAt == -1:
            return ""

        bodyStart = markerAt + len(marker)
        position = bodyStart
        depth = 1  # the marker itself opened one template
        while True:
            nextOpen = text.find("{{", position)
            nextClose = text.find("}}", position)
            if nextClose == -1:
                return text[bodyStart:]
            if nextOpen != -1 and nextOpen < nextClose:
                depth += 1
                position = nextOpen + 2
            else:
                depth -= 1
                if depth == 0:
                    return text[bodyStart:nextClose]
                position = nextClose + 2

1条回答

网友

1楼 · 发布于 2024-09-30 22:23:58

现在，如果你想获取结构化数据，Wikidata绝对是首选。不管怎样，如果你将来需要解析Wikipedia文章中的数据，尤其是当你使用Python时，我可以推荐mwparserfromhell：这是一个Python库，旨在解析wikitext，它有一个提取模板及其属性的选项。这并不能直接解决您的问题，因为多种语言的多个模板肯定会有所不同，但如果您继续尝试解析wikitext，这可能会很有用。

相关问题更多 >

编程相关推荐

热门问题

热门文章