BeautifulSoup无法从社交媒体网站获取正确的信息

2024-09-29 00:19:18 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试编写一个程序,用来抓取某位网红的主要社交媒体主页,然后输出相关信息,比如他们有多少粉丝。收集个人主页链接的部分似乎运行良好。我的问题是,我不知道如何从TikTok(或目前的任何其他网站)的网页上获取粉丝数量。有人对如何获取这些信息有什么建议吗?另外,这个项目是我第一次真正使用Python,所以如果有让代码更简洁/更快的建议,我也非常感激。不过,我对Java很熟悉

from googlesearch import search
import requests
from bs4 import BeautifulSoup
from urllib import request, response, error, parse
from urllib.request import urlopen
import requests


# Profile URLs collected by doSearch() and later read by scrapeSites().
results = list()
# Search phrase typed by the user; doSearch() passes it to the search call.
query = input("search: ")

def doSearch(tldIn, num):
    """Collect at most one profile link per known site from a web search.

    Runs ``search(query, tld=tldIn, stop=num)`` using the module-level
    ``query`` and appends the first hit found for each supported site to the
    module-level ``results`` list. Later hits for a site that already has a
    link are ignored, so embedded posts from the same site do not produce
    duplicate entries.

    Args:
        tldIn: Forwarded unchanged as the ``tld`` argument of ``search``.
        num: Forwarded unchanged as the ``stop`` argument of ``search``.

    Returns:
        None. The function works by appending to ``results``.
    """
    # Domains of the profile sites we keep. Twitch profiles live on
    # twitch.tv (not twitch.com), so that is the domain to match.
    profile_domains = (
        "tiktok.com",
        "instagram.com",
        "facebook.com",
        "twitter.com",
        "snapchat.com",
        "twitch.tv",
        "youtube.com",
        "famousbirthdays.com",
    )
    # Sites for which a link has already been stored during this call.
    seen = set()
    for url in search(query, tld=tldIn, stop=num):
        # Compare against the host only, so that a URL which merely mentions
        # another site in its path or query string is not misclassified.
        host = parse.urlparse(url).hostname or ""
        for domain in profile_domains:
            if domain in seen:
                continue
            if host == domain or host.endswith("." + domain):
                seen.add(domain)
                results.append(url)
                # A URL has exactly one host, so it is stored at most once.
                break
           
def scrapeSites():
    """Download every URL in ``results`` and print TikTok follower counts.

    For each URL the HTTP status code is printed and the response body is
    parsed with BeautifulSoup using the ``lxml`` parser. For URLs containing
    ``tiktok.com`` the parsed page and the ``div.number`` elements are also
    printed for inspection, followed by the text of every
    ``<strong title="Followers">`` element found inside an ``.number``
    element of an ``h2.count-infos`` heading.

    A URL whose request fails is reported and skipped so that the remaining
    URLs are still processed.

    Returns:
        None. All output goes to stdout.
    """
    for url in results:
        try:
            # Without a timeout a stalled server would block the run forever.
            page = requests.get(url, timeout=10)
        except requests.RequestException as err:
            print("request failed for " + url + ": " + str(err))
            continue
        print(page.status_code)
        soup = BeautifulSoup(page.content, 'lxml')

        if "tiktok.com" not in url:
            continue

        # Debug output kept from the original script.
        print(soup.prettify())
        foll = soup.find_all('div', class_='number')
        print(foll)

        # NOTE(review): this assumes the follower count is present in the
        # static HTML; if the page fills it in via JavaScript nothing will
        # match here - confirm against the printed markup above.
        for heading in soup.find_all('h2', class_='count-infos'):
            # 'number' is a CSS class, not a tag name, so filter by class.
            for wrapper in heading.find_all(class_='number'):
                # Only class_ has a trailing-underscore alias; other
                # attributes such as title are matched by their plain name.
                tag = wrapper.find('strong', attrs={'title': 'Followers'})
                if tag is None:
                    continue
                print("TikTok FOllowers: " + tag.get_text())
       
        
    
# First pass: look through the search results with a stop value of 10.
doSearch('com', 10)
# Fewer than three profile links: discard them and search again with a
# stop value of 20.
if not len(results) >= 3:
    results.clear()
    doSearch('com', 20)
# Fetch every collected link, then list the links one per line.
scrapeSites()
print("\n".join(map(str, results)))

Tags: and, in, from, import, com, false, true, url