Scrapy SQLalchemy外键未在SQLite中创建

from sqlalchemy import create_engine, Column, Table, ForeignKey, MetaData from sqlalchemy import Integer, String, Date, DateTime, Float, Boolean, Text from sqlalchemy.orm import relationship from sqlalchemy.ext.declarative import declarative_base from scrapy.utils.project import get_project_settings Base = declarative_base() def db_connect(): ''' Performs database connection using database settings from settings.py. Returns sqlalchemy engine instance ''' return create_engine(get_project_settings().get('CONNECTION_STRING')) def create_table(engine): Base.metadata.create_all(engine) class BoardInfo(Base): __tablename__ = 'boardInfos' id = Column(Integer, primary_key=True) boardName = Column('boardName', String(100)) threadInfosLink = relationship('ThreadInfo', back_populates='boardInfosLink') # One-to-Many with threadInfo class ThreadInfo(Base): __tablename__ = 'threadInfos' id = Column(Integer, primary_key=True) threadTitle = Column('threadTitle', String()) threadLink = Column('threadLink', String()) threadAuthor = Column('threadAuthor', String()) threadPost = Column('threadPost', Text()) replyCount = Column('replyCount', Integer) readCount = Column('readCount', Integer) boardInfos_id = Column(Integer, ForeignKey('boardInfos.id')) # Many-to-One with boardInfo boardInfosLink = relationship('BoardInfo', back_populates='threadInfosLink') # Many-to-One with boardInfo postInfosLink = relationship('PostInfo', back_populates='threadInfosLink') # One-to-Many with postInfo authorInfos_id = Column(Integer, ForeignKey('authorInfos.id')) # Many-to-One with authorInfo authorInfosLink = relationship('AuthorInfo', back_populates='threadInfosLink') # Many-to-One with authorInfo class PostInfo(Base): __tablename__ = 'postInfos' id = Column(Integer, primary_key=True) postOrder = Column('postOrder', Integer, nullable=True) postAuthor = Column('postAuthor', Text(), nullable=True) postContent = Column('postContent', Text(), nullable=True) postTimestamp = Column('postTimestamp', Text(), nullable=True) threadInfos_id = Column(Integer, ForeignKey('threadInfos.id')) # Many-to-One with threadInfo threadInfosLink = relationship('ThreadInfo', back_populates='postInfosLink') # Many-to-One with threadInfo authorInfos_id = Column(Integer, ForeignKey('authorInfos.id')) # Many-to-One with authorInfo authorInfosLink = relationship('AuthorInfo', back_populates='postInfosLink') # Many-to-One with authorInfo class AuthorInfo(Base): __tablename__ = 'authorInfos' id = Column(Integer, primary_key=True) threadAuthor = Column('threadAuthor', String()) postInfosLink = relationship('PostInfo', back_populates='authorInfosLink') # One-to-Many with postInfo threadInfosLink = relationship('ThreadInfo', back_populates='authorInfosLink') # One-to-Many with threadInfo

from sqlalchemy import exists, event from sqlalchemy.orm import sessionmaker from scrapy.exceptions import DropItem from .models import db_connect, create_table, BoardInfo, ThreadInfo, PostInfo, AuthorInfo from sqlalchemy.engine import Engine from sqlite3 import Connection as SQLite3Connection import logging @event.listens_for(Engine, "connect") def _set_sqlite_pragma(dbapi_connection, connection_record): if isinstance(dbapi_connection, SQLite3Connection): cursor = dbapi_connection.cursor() cursor.execute("PRAGMA foreign_keys=ON;") # print("@@@@@@@ PRAGMA prog is running!! @@@@@@") cursor.close() class DuplicatesPipeline(object): def __init__(self): ''' Initializes database connection and sessionmaker. Creates tables. ''' engine = db_connect() create_table(engine) self.Session = sessionmaker(bind=engine) logging.info('****DuplicatesPipeline: database connected****') def process_item(self, item, spider): session = self.Session() exist_threadLink = session.query(exists().where(ThreadInfo.threadLink == item['threadLink'])).scalar() exist_thread_replyCount = session.query(ThreadInfo.replyCount).filter_by(threadLink = item['threadLink']).scalar() if exist_threadLink is True: # threadLink is in DB if exist_thread_replyCount < item['replyCount']: # check if replyCount is more? return item session.close() else: raise DropItem('Duplicated item found and replyCount is not changed') session.close() else: # New threadLink to be added to BoardPipeline return item session.close() class BoardPipeline(object): def __init__(self): ''' Initializes database connection and sessionmaker Creates tables ''' engine = db_connect() create_table(engine) self.Session = sessionmaker(bind=engine) def process_item(self, item, spider): ''' Save scraped info in the database This method is called for every item pipeline component ''' session = self.Session() # Input info to boardInfos boardInfo = BoardInfo() boardInfo.boardName = item['boardName'] # Input info to threadInfos threadInfo = ThreadInfo() threadInfo.threadTitle = item['threadTitle'] threadInfo.threadLink = item['threadLink'] threadInfo.threadAuthor = item['threadAuthor'] threadInfo.threadPost = item['threadPost'] threadInfo.replyCount = item['replyCount'] threadInfo.readCount = item['readCount'] # Input info to postInfos # Due to info is in list, so we have to loop and add it. for num in range(len(item['postOrder'])): postInfoNum = 'postInfo' + str(num) postInfoNum = PostInfo() postInfoNum.postOrder = item['postOrder'][num] postInfoNum.postAuthor = item['postAuthor'][num] postInfoNum.postContent = item['postContent'][num] postInfoNum.postTimestamp = item['postTimestamp'][num] session.add(postInfoNum) # Input info to authorInfo authorInfo = AuthorInfo() authorInfo.threadAuthor = item['threadAuthor'] # check whether the boardName exists exist_boardName = session.query(exists().where(BoardInfo.boardName == item['boardName'])).scalar() if exist_boardName is False: # the current boardName does not exists session.add(boardInfo) # check whether the threadAuthor exists exist_threadAuthor = session.query(exists().where(AuthorInfo.threadAuthor == item['threadAuthor'])).scalar() if exist_threadAuthor is False: # the current threadAuthor does not exists session.add(authorInfo) try: session.add(threadInfo) session.commit() except: session.rollback() raise finally: session.close() return item

2条回答

网友

1楼 · 编辑于 2024-10-03 06:19:05

根据我的另一个答案评论中的讨论，下面是我将如何合理化您的模型，使它们对我更有意义

注意:

我已经删除了所有不必要的“信息”
我已经从您的模型定义中删除了显式列名，并将依赖SQLAlchemy的能力根据我的属性名为我推断这些列名
在“Post”对象中，我没有将属性命名为PostContent，这意味着内容与Post相关，因为这就是我们访问它的方式，而只是将属性称为“Post”
我已经删除了所有“链接”术语，在我认为您需要引用相关对象集合的地方，我已经提供了该对象的复数属性作为关系
我在Post模型中留下了一行供您删除。正如你所看到的，你不需要两次“作者”，一次作为相关对象，一次在帖子上，这违背了FKs的目的

通过这些更改，当您尝试从其他代码中使用这些模型时，很明显您需要在何处使用.append（），以及在何处只分配相关对象。对于给定的Board对象，您知道“threads”是一个仅基于属性名的集合，因此您将执行类似于b.threads.append(thread)

from sqlalchemy import create_engine, Column, Table, ForeignKey, MetaData
from sqlalchemy import Integer, String, Date, DateTime, Float, Boolean, Text
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base

class Board(Base): 
    __tablename__ = 'board'
    id = Column(Integer, primary_key=True)
    name = Column(String(100)) 
    threads = relationship(back_populates='board')

class Thread(Base):
    __tablename__ = 'thread'
    id = Column(Integer, primary_key=True)
    title = Column(String())
    link = Column(String())
    author = Column(String())
    post = Column(Text())
    reply_count = Column(Integer)
    read_count = Column(Integer)

    board_id = Column(Integer, ForeignKey('Board.id'))
    board = relationship('Board', back_populates='threads')

    posts = relationship('Post', back_populates='threads')
    
    author_id = Column(Integer, ForeignKey('Author.id'))
    author = relationship('Author', back_populates='threads')

class Post(Base):
    __tablename__ = 'post'
    id = Column(Integer, primary_key=True)
    order = Column(Integer, nullable=True)
    author = Column(Text(), nullable=True)    # remove this line and instead use the relationship below
    content = Column(Text(), nullable=True)
    timestamp = Column(Text(), nullable=True)

    thread_id = Column(Integer, ForeignKey('Thread.id'))
    thread = relationship('Thread', back_populates='posts')
    
    author_id = Column(Integer, ForeignKey('Author.id')) 
    author = relationship('Author', back_populates='posts')

class AuthorInfo(Base):
    __tablename__ = 'author'
    id = Column(Integer, primary_key=True)
    name = Column(String())

    posts = relationship('Post', back_populates='author') 
    threads = relationship('Thread', back_populates='author')

网友

2楼 · 编辑于 2024-10-03 06:19:05

从我看到的代码中，我觉得您并不是在任何地方设置ThreadInfo.authorInfosLink或ThreadInfo.authorInfos_id（您的所有FK/关系也是如此）

对于要附加到ThreadInfo实例的相关对象，您需要创建它们，然后附加它们，如下所示：

        # Input info to authorInfo
        authorInfo = AuthorInfo()
        authorInfo.threadAuthor = item['threadAuthor'] 
        
        threadInfo.authorInfosLink = authorInfo

如果每个对象通过FK相关，则可能不希望将其session.add（）添加到每个对象中。您将希望：

实例化一个BoardInfo对象bi
然后实例化附加相关的ThreadInfo对象ti
将您的文件附加到相关对象上，例如bi.threadInfosLink = ti
在所有链接关系结束时，只需使用session.add(bi)将bi添加到会话中即可。所有相关对象都将通过它们的关系添加，FKs将是正确的

相关问题更多 >

编程相关推荐

热门问题

热门文章