使用 Beautiful Soup 抓取数据并保存到 CSV 文件中

2024-05-06 04:14:48 发布

您现在位置:Python中文网/ 问答频道 /正文

下面是要刮取的url

https://www.agtta.co.in/individuals.php

  • 我需要提取姓名、手机号码和电子邮件

  • 之后我需要保存到csv中

  • 我能用下面的代码刮取完整的数据

下面是使用用户代理提取的代码

 from bs4 import BeautifulSoup
 import urllib.request

 # Fetch each listed page with a browser-like User-Agent header and collect
 # every text node that follows the first <section class="b-branches">.
 urls = ['https://www.agtta.co.in/individuals.php']
 for url in urls:
     req = urllib.request.Request(
         url,
         headers={
             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
         },
     )
     # The context manager closes the HTTP response once it has been parsed.
     with urllib.request.urlopen(req) as resp:
         soup = BeautifulSoup(
             resp,
             from_encoding=resp.info().get_param('charset'),
             features='html.parser',
         )
     scrape_data = soup.find('section', class_='b-branches')
     # find() returns None when the page has no matching section; fall back to
     # an empty list instead of failing with AttributeError.
     to_list = scrape_data.find_all_next(string=True) if scrape_data is not None else []

我尝试了下面的代码:

# Class attribute value of the <h3> heading searched for in each result.
TITLE_CLASSES = 'b-branches__title ui-title-inner ui-title-inner_lg'

for biz in results:
    # find_all returns a list of matching tags, so printing it shows the
    # <h3> markup itself rather than only the text inside it.
    title = biz.find_all('h3', class_=TITLE_CLASSES)
    print(title)

我得到了[<h3 class="b-branches__title ui-title-inner ui-title-inner_lg">SHRI RAMESHBHAI P. SAKARIYA</h3>]

提取时结果中带有 HTML 标签,如何去掉这些标签?

我期望的输出如下

Name, Mobilenumber, Email

A, 333, mm@gmail.com

Tags: inhttpsurluititlerequestwwwurllib
2条回答

以下是执行此操作的完整代码:

from bs4 import BeautifulSoup
import requests
import pandas as pd

# Scrape name, mobile number and e-mail from every <section class="b-branches">
# on the page and write them to Details.csv.

# Browser-like User-Agent header sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
# A timeout keeps the script from waiting forever on an unresponsive server;
# raise_for_status() stops early on an HTTP error instead of parsing an error page.
response = requests.get('https://www.agtta.co.in/individuals.php', headers=headers, timeout=30)
response.raise_for_status()
r = response.text

soup = BeautifulSoup(r, 'html5lib')

sections = soup.find_all('section', class_="b-branches")

# Parallel lists: index i of each list describes the same section.
names = []
phone_numbers = []
emails = []

for section in sections:
    # Name: text of the first <h3>; None when the section has no heading.
    heading = section.h3
    names.append(heading.text if heading is not None else None)

    # Phone: whatever follows the 'Mobile No ' marker in the first <p>.
    paragraph = section.p
    phone_parts = paragraph.text.split('Mobile No ') if paragraph is not None else []
    phone_numbers.append(phone_parts[1] if len(phone_parts) > 1 else None)

    # E-mail: whatever follows the 'Email ' marker in the fourth <div>.
    # Explicit length checks replace the former bare `except:` so that only
    # "field is missing" yields None and real errors are no longer hidden.
    divs = section.find_all('div')
    email_parts = divs[3].text.split('Email ') if len(divs) > 3 else []
    emails.append(email_parts[1] if len(email_parts) > 1 else None)

details_dict = {"Names": names,
                "Phone Numbers": phone_numbers,
                "Emails": emails}
df = pd.DataFrame(details_dict)
df.to_csv("Details.csv", index=False)

输出:

enter image description here

希望这有帮助

from bs4 import BeautifulSoup
import urllib.request
import pandas as pd


# Scrape name, phone and e-mail from every <section class="b-branches"> on each
# listed page and write all rows to Details.csv.
urls = ['https://www.agtta.co.in/individuals.php']
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}

# Rows are accumulated across all URLs and written once after the loop;
# writing inside the loop would overwrite Details.csv for every page and keep
# only the rows of the last one.
result = []
for url in urls:
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the HTTP response once it has been parsed.
    with urllib.request.urlopen(req) as resp:
        soup = BeautifulSoup(
            resp,
            from_encoding=resp.info().get_param('charset'),
            features='html.parser',
        )
    for individual in soup.find_all("section", {"class": "b-branches"}):
        name_data = individual.h3
        name = name_data.text if name_data is not None else ""
        phone_data = individual.find('p')
        phone = phone_data.text.replace("Mobile No", "").strip() if phone_data is not None else ""
        # ':-soup-contains' is the current Soup Sieve spelling of the
        # deprecated ':contains' pseudo-class; select_one returns the first
        # matching <div> or None.
        email_data = individual.select_one('div:-soup-contains("Email")')
        email = email_data.text.replace("Email", "").strip() if email_data is not None else ""
        result.append({"Name": name, "Phone": phone, "Email": email})

# Explicit columns keep the CSV header present even when nothing was scraped.
output = pd.DataFrame(result, columns=["Name", "Phone", "Email"])
output.to_csv("Details.csv", index=False)

相关问题 更多 >