解析Python List

{"time": 12.640, "name": "machine1", "value": 24.0} {"time": 12.645, "name": "machine2", "value": 0.0} {"time": 12.65002, "name": "machine3", "value": true} {"time": 12.66505, "name": "machine4", "value": 1.345} {"time": 12.67007, "name": "machine5", "value": 5.068} {"time": 12.67508, "name": "machine4", "value": 1.075} {"time": 12.6801, "name": "machine5", "value": 2.0868} {"time": 12.6851, "name": "machine4", "value": 0.0} {"time": 12.6901, "name": "machine5", "value": 12.633} {"time": 12.69512, "name": "machine5", "value": 13.13} {"time": 12.70013, "name": "machine3", "value": false} {"time": 12.70515, "name": "machine3", "value": false} {"time": 12.71016, "name": "machine3", "value": false} {"time": 12.71517, "name": "machine5", "value": 131.633}

import json

# Parse the input as JSON Lines: one JSON object per line of raw.json.
data = []
with open('raw.json') as f:  # the with-block closes the file; no f.close() needed
    for line in f:
        if line.strip():  # skip blank lines, which json.loads would reject
            data.append(json.loads(line))

# Print one (index, time, name, value) tuple per record, numbering from 1.
for idx, row in enumerate(data, 1):
    data_list = idx, row['time'], row['name'], row['value']
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(data_list)

(1, 12.64, u'machine1', 24.0) (2, 12.645, u'machine2', 0.0) (3, 12.65002, u'machine3', True) (4, 12.66505, u'machine4', 1.345) (5, 12.67007, u'machine5', 5.068) (6, 12.67508, u'machine4', 1.075) (7, 12.6801, u'machine5', 2.0868) (8, 12.6851, u'machine4', 0.0) (9, 12.6901, u'machine5', 12.633) (10, 12.69512, u'machine5', 13.13) (11, 12.70013, u'machine3', False) (12, 12.70515, u'machine3', False) (13, 12.71016, u'machine3', False) (14, 12.71517, u'machine5', 131.633)

2条回答

网友

1楼 · 编辑于 2024-05-27 11:17:41

第一种解决方案

下面是我如何处理这个问题的：

import json
import collections

def summarize_times(rows):
    """Aggregate the 'time' field of each record per 'name'.

    Args:
        rows: iterable of dicts, each with a 'name' key and a numeric
            'time' key.

    Returns:
        dict mapping each name to a (count, total, average) tuple.
    """
    # defaultdict creates the 0 / 0.0 starting value on first access.
    time_total = collections.defaultdict(float)
    time_count = collections.defaultdict(int)
    for row in rows:
        time_count[row['name']] += 1
        time_total[row['name']] += row['time']

    # total is always a float, so this is true division on Python 2 as well.
    return {
        name: (count, time_total[name], time_total[name] / count)
        for name, count in time_count.items()
    }


if __name__ == '__main__':
    # Load file into data (one JSON object per line)
    with open('raw.json') as f:
        data = [json.loads(line) for line in f]

    # Report: name, count, total and average, sorted by name
    summary = summarize_times(data)
    for name in sorted(summary):
        # print(...) with one argument works on both Python 2 and Python 3.
        print('{:<10} {:2} {:8.2f} {:8.2f}'.format(name, *summary[name]))

讨论

data是一个由dict组成的列表，每个dict都包含name、time等键。
我使用了另外三个字典来记录每台机器的计数、总数和平均值。
我想你需要根据时间值来计算。如果不是，那就很容易解决。
defaultdict是一种很好的计数方法。如果尚未创建int值，则将创建该值并将其赋值为0，非常方便。你应该查一下。

第二种解决方案

这里有一种不同的方法：既然您的数据看起来像一个表，为什么不使用数据库来处理您的数据。这种方法的优点是你不必自己计算。

import json
import sqlite3

def time_statistics(lines):
    """Compute per-name statistics of 'time' using an in-memory SQLite database.

    Args:
        lines: iterable of strings, each a JSON object with 'name' and
            'time' keys (for example an open file in JSON Lines format).

    Returns:
        list of (name, count, sum, average) tuples, ordered by name.
    """
    # A fresh ':memory:' database is always empty, so no DROP TABLE is needed.
    connection = sqlite3.connect(':memory:')
    try:
        cursor = connection.cursor()
        cursor.execute('CREATE TABLE time_table (name text, time real)')

        # Insert every row in one executemany call and commit once,
        # instead of committing after each individual INSERT.
        records = (json.loads(line) for line in lines)
        cursor.executemany(
            'INSERT INTO time_table VALUES (?,?)',
            ((record['name'], record['time']) for record in records))
        connection.commit()

        # ORDER BY makes the row order explicit; GROUP BY alone does not
        # guarantee any particular order.
        cursor.execute(
            'SELECT name, COUNT(time), SUM(time), AVG(time) '
            'FROM time_table GROUP BY name ORDER BY name')
        return cursor.fetchall()
    finally:
        # Close the connection even if parsing or a query fails.
        connection.close()


if __name__ == '__main__':
    # Load file into the database and compute the statistics
    with open('raw.json') as f:
        rows = time_statistics(f)

    # Report: print the name, count, sum, and average
    print('%-10s %8s %8s %8s' % ('NAME', 'COUNT', 'SUM', 'AVERAGE'))
    for row in rows:
        print('%-10s %8d %8.2f %8.2f' % row)

输出

NAME          COUNT      SUM  AVERAGE
machine1          1    12.64    12.64
machine2          1    12.64    12.64
machine3          4    50.77    12.69
machine4          3    38.03    12.68
machine5          5    63.45    12.69

讨论

在这个解决方案中，我创建了一个内存中的SQLite3数据库
因为我们只对name和time列感兴趣，所以表只包含这两个列。
我们只需使用数据库就可以免费获得所有的统计函数，如SUM、COUNT和AVG。

添加到第一个解决方案

要回答这个问题：给定machine5，如何获取最后一个值？对此，我假设您希望先筛选出只包含machine5的数据，然后按时间排序并取最后一行。对于第一个解决方案，附加以下代码块并运行它：

# Filter data: keep and print all rows whose name is 'machine5'
print('\nFilter by machine5')
machine5 = [row for row in data if row['name'] == 'machine5']
# Sort on the full float timestamp. Wrapping it in int() would truncate the
# fractional part, making rows within the same whole second compare equal
# so that the sort would leave them in their original order.
machine5 = sorted(machine5, key=lambda row: row['time'])
pprint(machine5)

# Get the last instance (the row with the greatest time after sorting)
print('\nLast instance of machine5:')
latest_row = machine5[-1]
pprint(latest_row)

不要忘记在脚本开头添加以下内容：

from pprint import pprint

输出

Filter by machine5
[{u'name': u'machine5', u'time': 12.67007, u'value': 5.068},
 {u'name': u'machine5', u'time': 12.6801, u'value': 2.0868},
 {u'name': u'machine5', u'time': 12.6901, u'value': 12.633},
 {u'name': u'machine5', u'time': 12.69512, u'value': 13.13},
 {u'name': u'machine5', u'time': 12.71517, u'value': 131.633}]

Last instance of machine5:
{u'name': u'machine5', u'time': 12.71517, u'value': 131.633}

讨论

如果不想按时间对行进行排序，请删除sorted()行，这将为您提供未排序的输出。

网友

2楼 · 编辑于 2024-05-27 11:17:41

把每一行数据封装成一个类的实例（并非必须，但这样更好），定义一个比较函数（cmp），然后使用sort排序

class MachineInfo:
    """Plain record holding a time, a name and a value."""

    def __init__(self, info_time, name, value):
        # Store the three constructor arguments unchanged as attributes.
        self.info_time, self.name, self.value = info_time, name, value

def cmp_machines(a, b):
    """Three-way comparison of two objects by their ``name`` attribute.

    Args:
        a: object with a ``name`` attribute.
        b: object with a ``name`` attribute.

    Returns:
        -1 if ``a.name`` sorts before ``b.name``, 1 if it sorts after,
        and 0 if the two names are equal.
    """
    # The builtin cmp() was removed in Python 3; subtracting the two
    # boolean comparisons yields the same -1 / 0 / 1 result on Python 2 and 3.
    return (a.name > b.name) - (a.name < b.name)

sort还接受一个可选的比较函数参数。

import functools

info = []  # fill this with MachineInfo instances

# then call
# (cmp_to_key adapts a cmp-style function to the key= parameter; passing the
# comparison function positionally only works on Python 2)
info = sorted(info, key=functools.cmp_to_key(cmp_machines))

# or to sort in place
info.sort(key=functools.cmp_to_key(cmp_machines))

# alternatively define __lt__ on MachineInfo (or __cmp__, which only Python 2
# honours) and that will get used by default

还有更花哨的方法（参见 https://wiki.python.org/moin/HowTo/Sorting ），但保持简单明了是件好事。

第一种解决方案

讨论

第二种解决方案

输出

讨论

添加到第一个解决方案

输出

讨论

相关问题更多 >

编程相关推荐

热门问题

热门文章