为什么通过ORM加载SQLAlchemy对象，比通过原始MySQLdb游标加载行慢5-8倍？

mysql> use foo
mysql> describe Foo;
+-------+---------+------+-----+---------+-------+
| Field | Type    | Null | Key | Default | Extra |
+-------+---------+------+-----+---------+-------+
| id    | int(11) | NO   | PRI | NULL    |       |
| A     | int(11) | NO   |     | NULL    |       |
| B     | int(11) | NO   |     | NULL    |       |
| C     | int(11) | NO   |     | NULL    |       |
+-------+---------+------+-----+---------+-------+
mysql> SELECT COUNT(*) FROM Foo;
+----------+
| COUNT(*) |
+----------+
|  1000000 |
+----------+
mysql>

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Benchmark: load up to one million rows through a raw MySQLdb cursor and wrap
# each row in a plain Python object.
import MySQLdb
import sys
import time
import datetime


class Foo:
    """Plain holder for the three data columns (A, B, C) of one Foo row."""

    def __init__(self, a, b, c):
        self.a = a
        self.b = b
        self.c = c


start = datetime.datetime.now()
# Bind the name before the try block so the finally clause cannot hit an
# unbound `con` when MySQLdb.connect() itself raises.
con = None
try:
    con = MySQLdb.connect('localhost', 'root', 'xxx', 'foo')
    cur = con.cursor()
    cur.execute("""SELECT * FROM Foo LIMIT 1000000""")
    print("query execution time:  %s" % (datetime.datetime.now() - start))
    # Column 0 of each row is skipped; only columns 1-3 are stored.
    foos = [Foo(elem[1], elem[2], elem[3]) for elem in cur]
    con.commit()
except MySQLdb.Error as e:
    print("Error %d: %s" % (e.args[0], e.args[1]))
    sys.exit(1)
finally:
    if con:
        con.close()

print("total time:  %s" % (datetime.datetime.now() - start))

# Benchmark: load up to one million Foo rows as full SQLAlchemy ORM objects.
import sqlalchemy
import datetime
import MySQLdb
from sqlalchemy import Column, Integer, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship, backref

Base = declarative_base()


class Foo(Base):
    """ORM mapping for the `Foo` table: integer primary key plus A, B, C."""

    __tablename__ = 'Foo'

    id = Column(Integer, primary_key=True)
    # The generic Integer type takes no `unsigned` argument (that option
    # belongs to the MySQL dialect's INTEGER type), so the plain type is used.
    A = Column(Integer, nullable=False)
    B = Column(Integer, nullable=False)
    C = Column(Integer, nullable=False)


engine = create_engine('mysql+mysqldb://root:xxx@localhost/foo')
Session = sessionmaker(bind=engine)
session = Session()

start = datetime.datetime.now()
# .all() materialises every mapped object in one list before returning.
foos = session.query(Foo).limit(1000000).all()
print("total time:  %s" % (datetime.datetime.now() - start))

# Benchmark: iterate over every Foo row through the peewee ORM.
from peewee import *
import datetime

database = MySQLDatabase("foo", host="localhost", port=3306, user="root", passwd="xxx")


class Foo(Model):
    """peewee model for the `Foo` table with four integer fields."""

    id = IntegerField()
    A = IntegerField()
    B = IntegerField()
    C = IntegerField()

    class Meta:
        db_table = 'Foo'
        database = database


start = datetime.datetime.now()
foos = Foo.select()
# Iterating the select is what drives the row loading; the rows are only
# counted, not kept.
cnt = 0
for i in foos:
    cnt = cnt + 1
print("total time:  %s" % (datetime.datetime.now() - start))

package herbert.hibernateorm;

import java.util.List;
import org.hibernate.Session;
import org.hibernate.Transaction;
import org.hibernate.SessionFactory;
import org.hibernate.cfg.Configuration;

/**
 * Benchmark entry point: loads every {@code Foo} entity through Hibernate and
 * prints the number of entities and the elapsed wall-clock time.
 */
public class App {

    /**
     * Builds the session factory from the default Hibernate configuration,
     * runs the HQL query {@code FROM Foo} and reports the timing.
     *
     * @param args unused command-line arguments
     * @throws Exception if Hibernate configuration or the query fails
     */
    public static void main(String[] args) throws Exception {
        SessionFactory factory = new Configuration().configure().buildSessionFactory();
        Session session = factory.openSession();
        Transaction tx = session.beginTransaction();

        // Only the query itself is timed; factory and session setup are excluded.
        long start = System.currentTimeMillis();
        List foos = session.createQuery("FROM Foo").list();
        System.out.println(foos.size());
        System.out.printf("total time: %d\n", System.currentTimeMillis() - start);

        session.close();
    }
}

package herbert.hibernateorm; public class Foo { private int id, a, b, c; public Foo() {} public Foo(int A, int B, int C) { this.a=A; this.b=B; this.c=C; } public int getId() { return id; } public void setId(int id) { this.id = id; } public int getA() { return a; } public void setA(int a) { this.a = a; } public int getB() { return b; } public void setB(int b) { this.b = b; } public int getC() { return c; } public void setC(int c) { this.c = c; } }

<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE hibernate-configuration PUBLIC "-//Hibernate/Hibernate Configuration DTD 3.0//EN" "http://hibernate.sourceforge.net/hibernate-configuration-3.0.dtd"> <hibernate-configuration> <session-factory> <property name="hibernate.dialect">org.hibernate.dialect.MySQLDialect</property> <property name="hibernate.connection.driver_class">com.mysql.jdbc.Driver</property> <property name="hibernate.connection.url">jdbc:mysql://localhost:3306/foo?zeroDateTimeBehavior=convertToNull</property> <property name="hibernate.connection.username">root</property> <property name="hibernate.connection.password">xxx</property> <mapping resource="hibernate.hbm.xml"/> </session-factory> </hibernate-configuration>

<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE hibernate-mapping PUBLIC "-//Hibernate/Hibernate Mapping DTD 3.0//EN" "http://hibernate.sourceforge.net/hibernate-mapping-3.0.dtd"> <hibernate-mapping> <class name="herbert.hibernateorm.Foo" table="Foo" catalog="foo"> <id name="id" type="int"> <column name="id" /> <generator class="assigned" /> </id> <property name="a" type="int"> <column name="A" not-null="true" /> </property> <property name="b" type="int"> <column name="B" not-null="true" /> </property> <property name="c" type="int"> <column name="C" not-null="true" /> </property> </class> </hibernate-mapping>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>herbert</groupId> <artifactId>hibernateORM</artifactId> <version>1.0-SNAPSHOT</version> <packaging>jar</packaging> <name>hibernateORM</name> <url>http://maven.apache.org</url> <repositories> <repository> <id>unknown-jars-temp-repo</id> <name>A temporary repository created by NetBeans for libraries and jars it could not identify. Please replace the dependencies in this repository with correct ones and delete this repository.</name> <url>file:${project.basedir}/lib</url> </repository> </repositories> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> </properties> <dependencies> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>3.8.1</version> <scope>test</scope> </dependency> <dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>5.1.21</version> </dependency> <dependency> <groupId>org.hibernate</groupId> <artifactId>hibernate-core</artifactId> <version>4.0.1.Final</version> </dependency> <dependency> <groupId>org.hibernate</groupId> <artifactId>hibernate-entitymanager</artifactId> <version>4.0.1.Final</version> </dependency> <dependency> <groupId>org.hibernate.common</groupId> <artifactId>hibernate-commons-annotations</artifactId> <version>4.0.1.Final</version> </dependency> <dependency> <groupId>nz.ac.waikato.cms.weka</groupId> <artifactId>weka-dev</artifactId> <version>3.7.10</version> </dependency> <dependency> <groupId>commons-configuration</groupId> <artifactId>commons-configuration</artifactId> <version>1.9</version> </dependency> <dependency> <groupId>commons-net</groupId> <artifactId>commons-net</artifactId> <version>3.1</version> <classifier>examples</classifier> </dependency> <dependency> 
<groupId>com.google.code.gson</groupId> <artifactId>gson</artifactId> <version>2.2.2</version> </dependency> <dependency> <groupId>maven</groupId> <artifactId>maven-jetty-plugin</artifactId> <version>1.1</version> <type>plugin</type> </dependency> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> <version>2.4</version> </dependency> <dependency> <groupId>com.kenai.nbpwr</groupId> <artifactId>org-slf4j-jdk14</artifactId> <version>1.6.1-201106101300</version> <type>nbm</type> </dependency> </dependencies> </project>

3条回答

网友

1楼 · 编辑于 2024-04-28 14:30:11

这不是我问题的答案，但可能有助于大家解决大型数据集的速度问题。我发现，选择一百万条记录通常可以在3秒左右完成，但是JOIN可能会拖慢这个过程。例如，当大约有150k个Foo、且它们与1M个Bar存在一对多关系时，用JOIN来选择这些Foo可能会很慢，因为每个Foo大约会被返回6.5次。我发现，在Python中分别选择两个表并用dict把它们连接起来，比SQLAlchemy（大约25秒）快约3倍，比使用JOIN的“裸”Python代码（大约17秒）快约2倍。在我的用例中，这段代码花费了8秒。选择1M条不带关系的记录（如上面的Bar示例）需要3秒钟。我用了这段代码：

#!/usr/bin/python
# -*- coding: utf-8 -*-

import MySQLdb
import sys
import time
import datetime
import inspect
from operator import itemgetter, attrgetter

# fetch all objects of class Class, where the fields are determined as the
# arguments of the __init__ constructor (not flexible, but fairly simple ;))
def fetch(Class, cursor, tablename, ids=("id",), where=None):
    """Load every row of `tablename` as a `Class` instance, keyed by id.

    The selected columns are the parameter names of `Class.__init__`
    (excluding the first one, i.e. `self`), so each fetched record can be
    passed positionally to the constructor.

    Args:
        Class: class whose `__init__` parameter names match column names.
        cursor: DB-API style cursor; it must provide `execute(sql)` and be
            iterable over the resulting records.
        tablename: name of the table to select from.
        ids: names of the constructor arguments that form the key. With one
            name the dict key is that column's value; with several names it
            is a tuple of the values, in the order given.
        where: optional raw SQL condition appended after `WHERE`.

    Returns:
        dict mapping each record's key to the constructed `Class` instance.

    NOTE: `tablename` and `where` are interpolated into the SQL text as-is,
    so they must never come from untrusted input.
    """
    # inspect.getargspec was removed in Python 3.11; prefer getfullargspec
    # where it exists and fall back to getargspec on Python 2.
    get_argspec = getattr(inspect, "getfullargspec", None) or inspect.getargspec
    arguments = list(get_argspec(Class.__init__).args)[1:]  # drop `self`

    fields = ", ".join(
        "`" + tablename + "`.`" + column + "`" for column in arguments
    )
    sql = "SELECT " + fields + " FROM `" + tablename + "`"
    if where is not None:
        sql += " WHERE " + where
    sql += ";"

    # Positions of the key columns inside each fetched record.
    getId = itemgetter(*[arguments.index(x) for x in ids])

    cursor.execute(sql)
    elements = {}
    for record in cursor:
        elements[getId(record)] = Class(*record)
    return elements

# attach the objects in dict2 to dict1, given a 1-many relation between both
def merge(dict1, fieldname, dict2, ids):
    """Group the objects of `dict2` under their owning objects in `dict1`.

    Every value of `dict1` first gets a fresh empty list stored in the
    attribute `fieldname`. Each value of `dict2` is then appended to that
    list on the `dict1` entry whose key equals the value(s) of the `ids`
    attributes read from the `dict2` object (a single value for one
    attribute name, a tuple for several).

    Raises:
        KeyError: if a `dict2` object refers to a key missing from `dict1`.
    """
    parent_key_of = attrgetter(*ids)

    for parent in dict1.values():
        setattr(parent, fieldname, [])

    for child in dict2.values():
        owner = dict1[parent_key_of(child)]
        getattr(owner, fieldname).append(child)

# attach dict2 objects to dict1 objects, given a 1-1 relation
def attach(dict1, fieldname, dict2, ids):
    """Link each object in `dict1` to its single counterpart in `dict2`.

    For every value of `dict1`, the attributes named in `ids` are read from
    it (a single value for one name, a tuple for several), used as a key into
    `dict2`, and the object found there is stored on the `dict1` object under
    the attribute `fieldname`.

    Raises:
        KeyError: if the computed key is not present in `dict2`.
    """
    key_of = attrgetter(*ids)
    for owner in dict1.values():
        setattr(owner, fieldname, dict2[key_of(owner)])

它帮助我加快了查询速度，但是我非常高兴听到专家们对这种方法可能的改进。

网友

2楼 · 编辑于 2024-04-28 14:30:11

下面是您的MySQL脚本的SQLAlchemy版本，它的执行时间约为四秒，而MySQLdb版本约为三秒：

from sqlalchemy import Integer, Column, create_engine, MetaData, Table
import datetime

# Container that the table definition below is registered on.
metadata = MetaData()

# SQLAlchemy Core description of the `foo` table: an integer primary key `id`
# and three NOT NULL integer columns `a`, `b`, `c`.
foo = Table(
    'foo', metadata,
    Column('id', Integer, primary_key=True),
    Column('a', Integer(), nullable=False),
    Column('b', Integer(), nullable=False),
    Column('c', Integer(), nullable=False),
)


class Foo(object):
    """Plain (unmapped) value object holding the a, b and c values of a row."""

    def __init__(self, a, b, c):
        # Store the three values unchanged as instance attributes.
        self.a, self.b, self.c = a, b, c

# echo=True turns on the engine's SQL statement logging.
engine = create_engine('mysql+mysqldb://scott:tiger@localhost/test', echo=True)
start = datetime.datetime.now()

# Run a Core SELECT limited to 1,000,000 rows, fetch all rows at once and
# build one plain Foo object per row from its a/b/c columns.
with engine.connect() as conn:
    foos = [
        Foo(row['a'], row['b'], row['c'])
        for row in
        conn.execute(foo.select().limit(1000000)).fetchall()
    ]


# Elapsed wall-clock time, including connecting, querying and object creation.
print "total time: ", datetime.datetime.now() - start

运行时：

total time:  0:00:04.706010

下面是一个使用ORM完整加载对象行的脚本；通过使用 yield_per 避免一次性创建包含全部1M个对象的固定列表，它在SQLAlchemy master上运行需要13秒（rel 0.9上为18秒）：

import time
from sqlalchemy import Integer, Column, create_engine, Table
from sqlalchemy.orm import Session
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


# Declarative ORM class mapped to the `foo` table. The table is supplied
# directly through __table__ and registered on Base.metadata.
class Foo(Base):
    __table__ = Table(
        'foo', Base.metadata,
        # Integer primary key.
        Column('id', Integer, primary_key=True),
        # Three NOT NULL integer data columns.
        Column('a', Integer(), nullable=False),
        Column('b', Integer(), nullable=False),
        Column('c', Integer(), nullable=False),
    )


# echo=True turns on the engine's SQL statement logging.
engine = create_engine('mysql+mysqldb://scott:tiger@localhost/test', echo=True)

sess = Session(engine)

# Wall-clock start of the timed section (seconds since the epoch).
now = time.time()

# avoid using all() so that we don't have the overhead of building
# a large list of full objects in memory
# yield_per(100) asks the ORM to hand back objects in batches of 100; each
# object is discarded immediately, so only loading cost is measured.
for obj in sess.query(Foo).yield_per(100).limit(1000000):
    pass

# %d truncates the elapsed time to whole seconds.
print("Total time: %d" % (time.time() - now))

然后，我们可以分割这两种方法之间的差异，并使用ORM只加载单个列：

# Query the four individual columns instead of the Foo entity, so each
# result is a row of column values rather than a full Foo object.
for obj in sess.query(Foo.id, Foo.a, Foo.b, Foo.c).yield_per(100).limit(1000000):
    pass

上述操作将在4秒内再次运行。

把SQLAlchemy Core与原始MySQLdb游标进行比较才更为贴切。如果使用ORM但只查询单个列，在最新版本中大约需要4秒。

在ORM级别，速度问题的原因是：在Python中创建对象本身就很慢，而且SQLAlchemy ORM在获取对象时要对这些对象做大量的簿记工作，这是它履行其使用契约所必需的，包括工作单元（unit of work）、标识映射（identity map）、预先加载（eager loading）、集合等。

要显著加快查询速度，请获取单个列而不是完整对象。请参见 http://docs.sqlalchemy.org/en/latest/faq/performance.html#result-fetching-slowness-orm ，其中对此有详细说明。

与PeeWee相比：PW是一个简单得多的系统，功能也少得多，其中包括它完全不处理标识映射（identity map）。即使是PeeWee这样尽可能简单的ORM，仍然需要15秒，这就证明了与使用纯C实现的原始MySQLdb fetch相比，cPython确实非常慢。

与Java相比：Java VM要比cPython快得多。Hibernate异常复杂，但是Java VM由于有JIT而非常快，即使有这么多复杂性，最终运行起来仍然更快。如果要将Python与Java进行比较，请使用PyPy。

网友

3楼 · 编辑于 2024-04-28 14:30:11

SQLAlchemy很复杂。它必须处理：把底层数据库本身不支持的类型转换为Python类型、带继承的表、JOIN、缓存对象、保持一致性、经过转换的行、部分结果等等。看看sqlalchemy/orm/loading.py:instance_processor——这太疯狂了。

解决方案是针对特定查询生成并编译专门的Python代码来处理其结果，就像Jinja2对模板所做的那样。到目前为止，还没有人做过这项工作，可能是因为常见的情况只有几行数据（此时这种优化反而得不偿失），而需要处理大量数据的人会像您一样手工完成这项工作。

相关问题更多 >

编程相关推荐

热门问题

热门文章