我在Jupyter笔记本中运行PySpark,试图加载大量大型JSON文件。我以前用一个文件(11gb大小)测试过同一个脚本,它工作时没有任何问题。
我的Python脚本:
import os, sys, pandas, time
import findspark
findspark.init('/opt/cloudera/parcels/SPARK2/lib/spark2')
import pyspark
os.environ['PYSPARK_PYTHON'] = "/opt/cloudera/Anaconda3/bin/python"
from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql.types import *
from pyspark.sql import Row, SparkSession, HiveContext, SQLContext
# Input/output locations (redacted in the original post).
target_directory = "removed for this post"
top_directory = "removed for this post"

# Build the Spark configuration.
# NOTE(review): the original had trailing commas after several statements,
# which silently wrapped each result in a throwaway one-element tuple;
# they are removed here (no behavioral change, but misleading to readers).
sc_conf = SparkConf()
sc_conf.setAppName("test")
sc_conf.set('spark.ui.port', 49051)
sc_conf.set('spark.executor.memory', '18g')
sc_conf.set('spark.executor.cores', '4')
sc_conf.set('spark.executor.extraJavaOptions', '-XX:+UseG1GC')
sc_conf.set('spark.driver.memory', '18g')
sc_conf.set('spark.yarn.am.memory', '8g')
sc_conf.set('spark.yarn.am.cores', '4')
sc_conf.set('spark.task.cpus', '1')
sc_conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')

# One SparkSession with Hive support; this makes a separate
# HiveContext/SQLContext (deprecated since Spark 2.0) unnecessary.
ses = (
    SparkSession
    .builder
    .config(conf=sc_conf)
    .enableHiveSupport()
    .getOrCreate()
)
sc = ses.sparkContext
start_time = time.time()

try:
    # Load every matching JSON file in one pass. Reading goes through the
    # session's DataFrameReader instead of the deprecated HiveContext.
    print("LOADING JSON DF")
    start_time_json = time.time()
    jsonDF = ses.read.json(top_directory + "traceroute*")
    print("SUCCESS")
    print("Time elapsed: " + str(time.time() - start_time_json) + " seconds")
    print("\n")
    jsonDF.printSchema()
    jsonDF.show(3)

    # Switch the current Hive database (name kept exactly as posted).
    ses.sql("USE my_table_name")

    # Write the DataFrame out as ORC.
    start_time_orc = time.time()
    print("WRITING ORC")
    jsonDF.write.orc("../traceroute_orc")
    print("SUCCESS")
    print("Time elapsed: " + str(time.time() - start_time_orc) + " seconds")
    print("\n")

    # Write the same DataFrame out as Parquet.
    start_time_parquet = time.time()
    print("WRITING PARQUET")
    jsonDF.write.parquet("../traceroute_parquet")
    print("SUCCESS")
    print("Time elapsed: " + str(time.time() - start_time_parquet) + " seconds")
    print("\n")

    print("Total time elapsed: " + str(time.time() - start_time) + " seconds")
finally:
    # Always release the cluster resources, even if a read/write stage fails;
    # the original only stopped the context on the success path.
    sc.stop()
然后我得到一个错误:
[此处的错误堆栈在转贴时丢失;根据下文,错误提示这些列中存在重复项。]以下是一个对象的JSON模式:https://pastebin.com/W6nHuJuL
为什么它说这些列中有重复项?正如您从我发布的JSON中可以看到的,这里没有重复。在
目前没有回答
相关问题 更多 >
编程相关推荐