Webscraping：删除文档前20个字符中的单词？

# Python 2 script (uses urllib2 and the reload builtin): download a speech
# transcript page and reduce it to lower-case text without punctuation,
# ready for further text analysis.
import os
import re
import sys
import urllib2
from multiprocessing import Pool
from string import punctuation as p

import nltk
import requests
from bs4 import BeautifulSoup, NavigableString

# NOTE(review): reload(sys) on its own has no visible effect in this script;
# presumably a leftover from a sys.setdefaultencoding() workaround -- confirm
# before removing.
reload(sys)

chester_url = 'http://millercenter.org/president/arthur/speeches/speech-3752'
chester_3752 = urllib2.urlopen(chester_url).read()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
chester_3752 = BeautifulSoup(chester_3752, 'html.parser')

# Find the speech itself within the HTML.
# The third positional parameter of find() is `recursive`, not a second
# attribute filter, so the former {'class': 'displaytext'} argument was never
# used for matching; the lookup has always been by id only.
chester_3752 = chester_3752.find('div', {'id': 'transcript'})

# .text drops the markup (e.g. '<br/>'); lower-case everything for analysis.
chester_3752 = chester_3752.text.lower()

# For further text analysis, remove ASCII punctuation.
punctuation = re.compile('[{}]+'.format(re.escape(p)))
chester_3752 = punctuation.sub('', chester_3752)

# The em dash is not part of string.punctuation, so handle it separately.
# Written as a unicode escape so the source file stays pure ASCII.
chester_3752 = chester_3752.replace(u'\u2014', u' ')

# NOTE: str.replace removes every occurrence of 'transcript', not only the
# heading at the start of the text.
chester_3752 = chester_3752.replace('transcript', '')

2条回答

网友

1楼 · 编辑于 2024-09-30 00:37:59

我不确定你的问题具体是什么，但我用 Python 3.4 和 bs4 运行这段代码时，它确实删除了“transcript”和一堆标点符号（我去掉了一些多余的 import，并把 urllib2 改成了 urllib.request）。

# Python 3 script: download a speech transcript page, strip markup and
# punctuation, remove the word 'transcript', and print the result.
import re
import urllib.request
from string import punctuation as p

from bs4 import BeautifulSoup

chester_url = 'http://millercenter.org/president/arthur/speeches/speech-3752'

# Use the response as a context manager so the connection is closed as soon
# as the body has been read.
with urllib.request.urlopen(chester_url) as response:
    chester_3752 = response.read()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
chester_3752 = BeautifulSoup(chester_3752, 'html.parser')

# Find the speech itself within the HTML.
# The third positional parameter of find() is `recursive`, not a second
# attribute filter, so the former {'class': 'displaytext'} argument was never
# used for matching; the lookup has always been by id only.
chester_3752 = chester_3752.find('div', {'id': 'transcript'})

# .text drops the markup (e.g. '<br/>'); lower-case everything for analysis.
chester_3752 = chester_3752.text.lower()

# For further text analysis, remove ASCII punctuation.
punctuation = re.compile('[{}]+'.format(re.escape(p)))
chester_3752 = punctuation.sub('', chester_3752)

# The em dash is not part of string.punctuation, so handle it separately.
chester_3752 = chester_3752.replace('—', ' ')

# NOTE: str.replace removes every occurrence of 'transcript', not only the
# heading at the start of the text.
chester_3752 = chester_3752.replace('transcript', '')

print(chester_3752)

网友

2楼 · 编辑于 2024-09-30 00:37:59

我试过你的代码，运行良好，但我建议做一个小调整：不要使用 replace，而是使用 startswith 来确认字符串确实以 transcript 开头。replace 会删除整个字符串中出现的所有 transcript，而你真正需要的只是删除位于字符串开头的那个 transcript。

# Python 2 script (uses urllib2 and the reload builtin): download a speech
# transcript page, strip markup and punctuation, then drop the word
# 'transcript' only when it is the very first word of the text.
import re
import sys
import urllib2
from string import punctuation as p

# BeautifulSoup is used below, so it has to be imported here.
from bs4 import BeautifulSoup

# NOTE(review): reload(sys) on its own has no visible effect in this script;
# presumably a leftover from a sys.setdefaultencoding() workaround -- confirm
# before removing.
reload(sys)

chester_url = 'http://millercenter.org/president/arthur/speeches/speech-3752'
chester_3752 = urllib2.urlopen(chester_url).read()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
chester_3752 = BeautifulSoup(chester_3752, 'html.parser')

# Find the speech itself within the HTML.
# The third positional parameter of find() is `recursive`, not a second
# attribute filter, so the former {'class': 'displaytext'} argument was never
# used for matching; the lookup has always been by id only.
chester_3752 = chester_3752.find('div', {'id': 'transcript'})

# .text drops the markup (e.g. '<br/>'); lower-case everything for analysis.
chester_3752 = chester_3752.text.lower()

# For further text analysis, remove ASCII punctuation.
punctuation = re.compile('[{}]+'.format(re.escape(p)))
chester_3752 = punctuation.sub('', chester_3752)

# Replace the em dash, which string.punctuation does not cover. The ASCII
# hyphen was already removed by the substitution above, so replacing '-'
# here would do nothing. Written as a unicode escape to keep the source ASCII.
chester_3752 = chester_3752.replace(u'\u2014', u' ')

# Show the text before the leading heading is removed.
print(chester_3752)

# Do not use chester_3752.replace('transcript', ''): it would delete every
# occurrence of the word in the speech. Only strip it when the text actually
# starts with it, which is what the question asks for.
prefix = 'transcript'
if chester_3752.startswith(prefix):
    chester_3752 = chester_3752[len(prefix):].strip()

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(chester_3752)

相关问题更多 >

编程相关推荐

热门问题

热门文章