抓取网页中的所有URL

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re

# Download the greyhound profile page and collect the target of every link.
req = Request("https://www.gbgb.org.uk//greyhound-profile//")

# Use the response as a context manager so the HTTP connection is closed
# as soon as the document has been parsed.
with urlopen(req) as html_page:
    soup = BeautifulSoup(html_page, "lxml")

# href=True skips anchors that have no href attribute, which would otherwise
# put None entries into the list.
links = [link["href"] for link in soup.find_all("a", href=True)]
print(links)

2条回答

网友

1楼 · 编辑于 2024-09-25 00:35:15

您可以将返回的结果转换为DataFrame，然后只保留winnerOr2ndName和winnerOr2ndId这两列即可

范例

import json
import requests
import pandas as pd

def _fetch_page(url, params):
    """Request one page of results and return the decoded JSON payload."""
    response = requests.get(url, params=params, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error body.
    response.raise_for_status()
    return response.json()


def _extract_winners(items):
    """Return the winner/runner-up name and id columns of one page of items."""
    columns = ["winnerOr2ndName", "winnerOr2ndId"]
    # reindex (rather than .loc) keeps an empty page, or a page where a column
    # is absent, from raising KeyError; such rows are removed by dropna().
    frame = pd.DataFrame(items).reindex(columns=columns).dropna()
    frame["winnerOr2ndId"] = frame["winnerOr2ndId"].astype(int)
    return frame


def get_items(dog_id):
    """Collect the distinct winner/runner-up dogs from a dog's race results.

    Every result page for ``dog_id`` is requested from the GBGB results API;
    the number of pages is read from ``meta.pageCount`` of the first response.

    Args:
        dog_id: Identifier of the dog, interpolated into the API URL.

    Returns:
        A pandas DataFrame with the columns ``winnerOr2ndName`` and
        ``winnerOr2ndId`` (the latter cast to int), with rows missing either
        value and duplicate rows removed.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # The page number is sent only through ``params``; also embedding it in
    # the URL would make every request carry two conflicting ``page`` values.
    url = f"https://api.gbgb.org.uk/api/results/dog/{dog_id}"
    # NOTE(review): assumes the API numbers pages from 1 - confirm.
    params = {"page": "1", "itemsPerPage": "20", "race_type": "race"}

    first_page = _fetch_page(url, params)
    page_count = int(first_page["meta"]["pageCount"])
    frames = [_extract_winners(first_page["items"])]

    for page in range(2, page_count + 1):
        params["page"] = str(page)
        frames.append(_extract_winners(_fetch_page(url, params)["items"]))

    # Concatenate once at the end rather than re-copying the result per page.
    return pd.concat(frames).drop_duplicates()

它将生成如下所示的数据帧：

网友

2楼 · 编辑于 2024-09-25 00:35:15

数据是通过调用外部API的URL加载的。您可以参考下面的示例来加载数据（使用ID）：

import json
import requests


# 517801 is the dog ID taken from the URL in the question.
api_url = "https://api.gbgb.org.uk/api/results/dog/517801"
params = {"page": "1", "itemsPerPage": "20", "race_type": "race"}

# Walk the result pages in order until the API returns a page with no items.
page = 1
while True:
    params["page"] = page
    data = requests.get(api_url, params=params).json()

    # To inspect the full payload, print json.dumps(data, indent=4) here.

    if not data["items"]:
        break

    # One line per result: name left-aligned in 30 columns, then the id.
    for entry in data["items"]:
        name = entry.get("winnerOr2ndName", "")
        dog_id = entry.get("winnerOr2ndId", "")
        print(f"{name:<30} {dog_id}")

    page += 1

输出：

Ferndale Boom                  534358
Laganore Mustang               543937
Tickity Kara                   535237
Thor                           511842
Ballyboughlewiss               519556
Beef Cakes                     551323
Distant Millie                 546674
Lissan Kels                    525148
Rosstemple Marko               534276
Happy Harry                    550042
Porthall Ella                  550841
Southlodge Eden                531677
Effernogue Beef                547416
Faydas Truffle                 528780
Johns Lass                     538763
Faydas Truffle                 528780
Toms Hero                      543659
Affane Buzz                    547555
Emkay Flyer                    531456
Ballymac Tilly                 492923
Kilcrea Duke                   542178
Sporting Sultan                541880
Droopys Poet                   542020
Shortwood Elle                 527241
Rosstemple Marko               534276
Erics Bozo                     541863
Swift Launch                   536667
Longsearch                     523017
Swift Launch                   536667
Takemyhand                     535023
Floral Print                   527192
Rustys Aero                    497270
Autumn Dapper                  519528
Droopys Kiwi                   511989
Deep Chest                     520634
Newtack Henry                  525511
Indian Nightmare               524636
Lady Mascara                   528399
Tarsna Yankee                  517373
                               
Leathems Act                   516918
Final Star                     514015
Ascot Faye                     500812
Ballymac Ernie                 503569

相关问题更多 >

编程相关推荐

热门问题

热门文章

刮取网页的所有URL

相关问题 更多 >

编程相关推荐

热门问题

热门文章

相关问题更多 >