在使用PyRSS2Gen生成xml时,如何将网站更新反映到xml中,同时保持现有xml内容的完整性?

2024-06-15 00:00:53 发布

您现在位置:Python中文网/ 问答频道 /正文

我一直在尝试使用Python3中的PyRSS2Gen模块为某个站点生成RSS。通过下面的代码,我已经能够生成xml文件,并且可以在RSS阅读器(Liferea)中打开该xml文件并读取其中的内容。

但是,如果网站更新了一篇新文章,我该如何在保持xml文件中现有条目不变的同时,把新增的条目也反映到xml文件中?

完整代码如下:

    #!/usr/bin/env python3
# -*- coding: utf-8 -*-

import urllib.request
import re   
import datetime
import PyRSS2Gen
import itertools


def target_url():
    """Return the URL of the page that the feed is built from."""
    return "xxx"

def anti_anti_crawling():
    """Return the HTTP request headers sent with the page download.

    The only header is a ``user-agent`` string identifying a desktop Chrome
    browser on Linux (presumably so the site does not reject the request as
    coming from a script -- inferred from the function name).
    """
    browser_user_agent = (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    )
    return {'user-agent': browser_user_agent}



def parse_response_to_html():
    """Download the target page and return its body as text.

    The request goes to ``target_url()`` with the headers from
    ``anti_anti_crawling()``.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    request = urllib.request.Request(target_url(), headers=anti_anti_crawling())
    # The context manager closes the HTTP response (and its socket) even when
    # read() raises, instead of leaving it open until garbage collection.
    with urllib.request.urlopen(request) as response:
        raw_body = response.read()

    return raw_body.decode('utf-8')




def extract_elements_for_xml(html=None):
    """Extract the per-article fields needed for the RSS items.

    Args:
        html: Page source to parse. When ``None`` (the default) the page is
            downloaded once via ``parse_response_to_html()``.

    Returns:
        tuple: ``(title_raw, link_raw, description_raw, pubDate_raw)``, four
        lists of strings. ``link_raw`` is a fixed list of seven empty
        placeholder strings; the other three hold every regex match found in
        the page, in document order.
    """
    # Download once and reuse the same text for every field. Fetching the page
    # separately per field costs extra requests and can pair a title from one
    # snapshot with a description or date from another if the site updates in
    # between.
    if html is None:
        html = parse_response_to_html()

    # Text between a closing </em> and the closing </h3> on the same line.
    title_search_pattern = re.compile(r'(?<=</em>).*(?=</h3>)')
    title_raw = title_search_pattern.findall(html)

    # Placeholder links: seven empty strings, no link extraction is done here.
    link_raw = ['', '', '', '', '', '', '']

    # Text between "<p><em>" and the closing </p> on the same line.
    description_search_pattern = re.compile(r'(?<=<p><em>).*(?=</p>)')
    description_raw = description_search_pattern.findall(html)

    # Text inside <span class="time">...</span> on the same line.
    pubDate_search_pattern = re.compile(r'(?<=<span class="time">).*(?=</span>)')
    pubDate_raw = pubDate_search_pattern.findall(html)

    return title_raw, link_raw, description_raw, pubDate_raw


def list_generation():
    """Group the extracted fields into one record per article.

    Returns:
        list: A list of ``[title, link, description, pubDate]`` lists. Its
        length is that of the shortest of the four field lists, because
        ``zip`` stops at the shortest input.
    """
    # Extract once and unpack. Calling extract_elements_for_xml() separately
    # for each field repeats the whole download-and-parse step every time and
    # can combine columns taken from different versions of the page.
    title, link, description, pubDate = extract_elements_for_xml()

    return [list(record) for record in zip(title, link, description, pubDate)]




def items_generation():
    """Build one ``PyRSS2Gen.RSSItem`` for each record from ``list_generation()``.

    Returns:
        list: The RSS items, in the same order as the records.
    """
    return [
        PyRSS2Gen.RSSItem(
            title=entry_title,
            link=entry_link,
            description=entry_description,
            pubDate=entry_date,
        )
        for entry_title, entry_link, entry_description, entry_date in list_generation()
    ]



def rss_generation():
    """Assemble the ``PyRSS2Gen.RSS2`` feed object.

    The channel uses fixed title, link and description values, the current
    local time as ``lastBuildDate``, and the items from ``items_generation()``.

    Returns:
        PyRSS2Gen.RSS2: The feed, ready to be serialized with ``write_xml``.
    """
    channel_fields = dict(
        title="xxx",
        link="xxx.com",
        description="",
        lastBuildDate=datetime.datetime.now(),
        items=items_generation(),
    )
    return PyRSS2Gen.RSS2(**channel_fields)


# Path of the RSS file to (over)write.
xmlFileName = "xxxxx"

# Write through a context manager so the file is flushed and closed even if
# feed generation fails part-way. The file is opened as UTF-8 and the same
# encoding is passed to write_xml so the XML declaration matches the bytes on
# disk; write_xml otherwise declares its default "iso-8859-1" while a text-mode
# file is written in the platform's locale encoding.
with open(xmlFileName, "w", encoding="utf-8") as xml_file:
    rss_generation().write_xml(xml_file, encoding="utf-8")

谢谢你的帮助


Tags: import, for, search, raw, return, title, response, def