Error when inserting data into a PySpark DataFrame

Posted 2024-09-28 22:23:47


I have a PySpark code sample in which I am trying to generate a JSON structure. Here is the code:

import json

from pyspark.sql import SparkSession
from pyspark.sql.functions import struct, udf
from pyspark.sql.types import StringType

def func(row):
    # Convert the Row into a dict and wrap it in a record-style envelope.
    temp = row.asDict()
    headDict = {}
    headDict['type'] = "record"
    headDict['name'] = "source"
    headDict['namespace'] = "com.streaming.event"
    headDict['doc'] = "SCD signals from  source"
    fieldslist = []
    headDict['fields'] = fieldslist
    for i in temp:
        fieldslist.append({i: temp[i]})
    return json.dumps(headDict)

if __name__ == "__main__":
    spark = SparkSession.builder.master("local[*]").appName("PythonWordCount").getOrCreate()
    payload=udf(func,StringType())
    data = spark.createDataFrame(
        [
            (1, "a", 'foo1'),  # create your data here, be consistent in the types.
            (2, "b", 'bar'),
            (3, "c", 'mnc')
        ],
        ['id', 'nm', 'txt']  # add your columns label here
    )
    df=data.withColumn("payload1",payload(struct([data[x] for x in data.columns])))
    df.show(3,False)

The error when inserting the data into the DataFrame:

  raise ValueError("Unexpected tuple %r with StructType" % obj)
ValueError: Unexpected tuple '{"namespace": "com.streaming.event", "type": "record", "name": "source", "fields": [{"txt": "mnc"}, {"id": 3}, {"nm": "c"}], "doc": "SCD signals from  source"}' with StructType

If I print the JSON payload by itself, I get the correct output:

{"namespace": "com.streaming.event", "type": "record", "name": "source", "fields": [{"txt": "mnc"}, {"id": 3}, {"nm": "c"}], "doc": "SCD signals from  source"}

I have also verified that this is valid JSON.
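One quick way to check this (a minimal sketch, standard library only):

import json

# json.loads raises a ValueError (json.JSONDecodeError on Python 3.5+)
# if the string is not valid JSON.
payload_str = '{"namespace": "com.streaming.event", "type": "record", "name": "source", "fields": [{"txt": "mnc"}, {"id": 3}, {"nm": "c"}], "doc": "SCD signals from  source"}'
parsed = json.loads(payload_str)
print(parsed["name"])  # source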

I am not sure what I am missing here.

Could this be a Python version issue? I am using Python 2.7.

Update: I tried running exactly the same code with Python 3.7, and it now works fine.


1 Answer

It works for me with Spark 3.x and Python 2.7.x:

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.0.0
      /_/

Using Python version 2.7.17 (default, Jul 20 2020 15:37:01)
SparkSession available as 'spark'.

Result from the PySpark shell:

import json

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

def func(row):
    temp = row.asDict()
    headDict = {}
    headDict['type'] = "record"
    headDict['name'] = "source"
    headDict['namespace'] = "com.streaming.event"
    headDict['doc'] = "SCD signals from  source"
    fieldslist = []
    headDict['fields'] = fieldslist
    for i in temp:
        fieldslist.append({i: temp[i]})
    return json.dumps(headDict)

spark = SparkSession.builder.master("local[*]").appName("PythonWordCount").getOrCreate()
payload = udf(func, StringType())
data = spark.createDataFrame([(1, "a", 'foo1'), (2, "b", 'bar'), (3, "c", 'mnc')], ['id', 'nm', 'txt'])
data.show()
'''
+---+---+----+
| id| nm| txt|
+---+---+----+
|  1|  a|foo1|
|  2|  b| bar|
|  3|  c| mnc|
+---+---+----+
'''


df = data.withColumn("payload1", payload(struct([data[x] for x in data.columns])))
df.show(3, False)
'''
+---+---+----+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |nm |txt |payload1                                                                                                                                                        |
+---+---+----+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1  |a  |foo1|{"namespace": "com.streaming.event", "type": "record", "name": "source", "fields": [{"txt": "foo1"}, {"id": 1}, {"nm": "a"}], "doc": "SCD signals from  source"}|
|2  |b  |bar |{"namespace": "com.streaming.event", "type": "record", "name": "source", "fields": [{"txt": "bar"}, {"id": 2}, {"nm": "b"}], "doc": "SCD signals from  source"} |
|3  |c  |mnc |{"namespace": "com.streaming.event", "type": "record", "name": "source", "fields": [{"txt": "mnc"}, {"id": 3}, {"nm": "c"}], "doc": "SCD signals from  source"} |
+---+---+----+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
'''
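As an aside, on Spark 3.x a payload like this can also be built without a Python UDF at all, using the built-in to_json, struct, and lit functions, which sidesteps Python-side serialization entirely. A minimal sketch follows (the app name is arbitrary; note that fields comes out as a one-element array holding a single object, rather than the list of one-key objects the UDF produces, because a Spark array cannot mix struct types):

from pyspark.sql import SparkSession
from pyspark.sql.functions import array, lit, struct, to_json

spark = SparkSession.builder.master("local[*]").appName("ToJsonSketch").getOrCreate()
data = spark.createDataFrame([(1, "a", 'foo1'), (2, "b", 'bar'), (3, "c", 'mnc')], ['id', 'nm', 'txt'])

# Constant header fields become literal columns; the row's columns are
# wrapped in one struct inside an array, since a Spark array cannot hold
# structs with different schemas.
envelope = struct(
    lit("record").alias("type"),
    lit("source").alias("name"),
    lit("com.streaming.event").alias("namespace"),
    lit("SCD signals from  source").alias("doc"),
    array(struct(*[data[c] for c in data.columns])).alias("fields"),
)
df = data.withColumn("payload1", to_json(envelope))
df.show(3, False)
# payload1 comes out as, e.g.:
# {"type":"record","name":"source","namespace":"com.streaming.event","doc":"SCD signals from  source","fields":[{"id":1,"nm":"a","txt":"foo1"}]}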
