使用 Python 网络爬虫抓取 Twitter 帐户

import requests
# import database as db
from bs4 import BeautifulSoup

# When True, skip the interactive prompt and use a fixed starting profile
# to save time during development.
debug = True


def getStartNode():
    """Set the module-level URLs describing the starting node's Twitter profile.

    Assigns three globals for use by other functions:

    * ``startNodeLink``      -- the profile URL itself
    * ``startNodeFollowers`` -- the profile URL with ``/followers`` appended
    * ``startNodeFollowing`` -- the profile URL with ``/following`` appended

    When ``debug`` is false the profile URL is read from standard input;
    otherwise a predetermined profile is used.
    """
    global startNodeFollowing
    global startNodeFollowers
    global startNodeLink

    if not debug:
        # Strip surrounding whitespace instead of blindly dropping the last
        # character: a trailing space typed to terminate the link in a
        # terminal is removed, and a link entered without one stays intact.
        startNodeLink = input(
            "Enter a link to the starting users Twitter profile\n[URL]: "
        ).strip()
    else:
        startNodeLink = "https://twitter.com/ckjellberg03"

    # Derive the followers / following page URLs from the profile URL.
    startNodeFollowers = startNodeLink + "/followers"
    startNodeFollowing = startNodeLink + "/following"

def spider():
    """Fetch the starting node's followers page and print the profile links found.

    Calls ``getStartNode()`` to populate the starting URLs, downloads the
    followers page, parses it with BeautifulSoup (``lxml`` parser) and prints
    the ``href`` of every anchor carrying the profile-link CSS classes.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the request times out).
    """
    getStartNode()
    print("\nUsing:", startNodeLink)

    # A timeout keeps the crawler from hanging forever on a stalled connection.
    response = requests.get(startNodeFollowers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')

    # CSS class string the original author identified for profile anchors.
    profile_attrs = {
        'class': 'css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l'
    }
    for link in soup.find_all('a', profile_attrs):
        # The attribute name must be the string 'href'; passing the bare
        # name ``href`` read the local variable before it was assigned.
        href = link.get('href')
        print(href)  # Display everything found (development purposes)

2条回答

网友

1楼 · 编辑于 2024-10-02 10:22:35

抓取 Twitter 非常困难（相信我，我已经尝试过各种方法）。你可以使用 Twitter API，但它有一些限制（例如无法获取关注者的名字，只能获取关注者的数量）。如果你想通过 Twitter API 抓取一些信息，可以使用以下代码：

from TwitterAPI import TwitterAPI, TwitterPager
import tweepy
from tweepy import Cursor
from datetime import datetime, date, time, timedelta

# Placeholder credentials passed to tweepy below; replace each string with
# the real value for your application before running.
consumer_key = 'consumer key'
consumer_secret = 'consumer secret'
token = 'token'
token_secret = 'token secret'

# Build the authenticated tweepy client used by the loop further down.
auth= tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(token, token_secret)
api = tweepy.API(auth)

# Screen names of the accounts to report on.
account_list = ['POTUS44']



# For each account: print a profile summary, the account age and average
# tweet rate, then walk the timeline collecting the hashtags and mentions
# used during the last 30 days.
for target in account_list:
    print("Getting data for " + target)
    item = api.get_user(target)
    print("name: " + item.name)
    print("screen_name: " + item.screen_name)
    print("description: " + item.description)
    print("statuses_count: " + str(item.statuses_count))
    print("friends_count: " + str(item.friends_count))
    print("followers_count: " + str(item.followers_count))

    # Account age and average tweet rate. This section used to appear twice
    # back to back, which printed the same two lines twice per account.
    # NOTE(review): assumes item.created_at is a naive UTC datetime; the
    # subtraction raises TypeError if it is timezone-aware -- confirm
    # against the installed tweepy version.
    tweets = item.statuses_count
    account_created_date = item.created_at
    delta = datetime.utcnow() - account_created_date
    account_age_days = delta.days
    print("Account age (in days): " + str(account_age_days))
    if account_age_days > 0:  # avoid dividing by zero for same-day accounts
        print("Average tweets per day: " + "%.2f" % (float(tweets) / float(account_age_days)))

    hashtags = []
    mentions = []
    tweet_count = 0
    end_date = datetime.utcnow() - timedelta(days=30)
    for status in Cursor(api.user_timeline, id=target).items():
        # Stop at the first tweet older than the window *before* counting
        # it, so tweet_count / hashtags / mentions only cover the window.
        if status.created_at < end_date:
            break
        tweet_count += 1

        # Statuses without an "entities" attribute contribute nothing.
        entities = getattr(status, "entities", None) or {}
        for ent in entities.get("hashtags") or []:
            if ent is not None and "text" in ent:
                hashtag = ent["text"]
                if hashtag is not None:
                    hashtags.append(hashtag)
        for ent in entities.get("user_mentions") or []:
            if ent is not None and "screen_name" in ent:
                name = ent["screen_name"]
                if name is not None:
                    mentions.append(name)

网友

2楼 · 编辑于 2024-10-02 10:22:35

下面是在不使用 API 的情况下实现的方法。难点之一在于需要在请求头的 User-Agent 中使用合适的浏览器标识。

import html
import re

import requests

# Request headers sent with the profile fetch below. The User-Agent presents
# the client as a Googlebot-compatible crawler -- presumably so the server
# returns static HTML that the tweet regex can match; confirm.
headers = { 'User-Agent': 'UCWEB/2.0 (compatible; Googlebot/2.1; +google.com/bot.html)'}


# Compiled once at import time. DOTALL lets "." cross newlines so that a tag
# whose attributes are wrapped over several lines is still removed.
_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def cleanhtml(raw_html):
    """Return *raw_html* with every ``<...>`` tag removed.

    Only the markup is stripped; text between tags and HTML entities such
    as ``&amp;`` are left untouched.

    Args:
        raw_html: HTML fragment as a string.

    Returns:
        The fragment's text with all tags deleted.
    """
    return _TAG_RE.sub('', raw_html)

# Build a plain-text report of the tweets found on each user's profile page.
# Pieces are collected in a list and joined once at the end instead of
# growing one string with repeated "+=".
parts = []
for user in ['billgates']:
    parts.append("============================\n\n")
    parts.append(user + "\n\n")
    parts.append("============================\n\n")
    url_twitter = 'https://twitter.com/%s' % user
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url_twitter, headers=headers, timeout=10)  # Send request
    res = re.findall(r'<p class="TweetTextSize.*?tweet-text.*?>(.*?)</p>', resp.text)
    for x in res:
        # Strip tags first, then decode entities: decoding first would turn
        # "&lt;...&gt;" into text that looks like a tag and gets removed.
        x = cleanhtml(x)
        # html.unescape handles every entity (&amp;, &lt;, &#39;, &quot;, ...)
        # rather than just three; &nbsp; decodes to U+00A0, which is mapped
        # to a plain space as before.
        x = html.unescape(x).replace("\xa0", " ")
        parts.append(x + "\n\n -\n\n")
content = "".join(parts)

相关问题更多 >

编程相关推荐

热门问题

热门文章