BigQuery Python:google.api_core.exceptions.BadRequest: 400 读取数据时出错,错误消息:模式不匹配:引用变量 'ro_sub_ros.$is_not_null' 的数组层级为 1,而对应的 Parquet 列字段路径中只有 0 个重复(REPEATED)字段。
我的原始数据如下所示:
# Sample row to be loaded into BigQuery. `now` and `pd` come from the
# surrounding script (not shown in this excerpt).
testData = {
    # Top-level scalar attributes of the RO.
    "ro_user_email": "tech@techietech.com",
    "ro_account_id": "23402042",
    "ro_sub_account_id": "34020334",
    "ro_name": "Test RO",
    "ro_number": "1304340",
    # Single label/value option.
    "ro_currency": {"label": "USD", "value": "USD"},
    # from/to range.
    "ro_dates": {"from": now, "to": now},
    "ro_status": "draft",
    "ro_operation_timestamp": pd.Timestamp(now),
    "ro_billing_cycle": {"label": "Fortnightly", "value": "Fortnightly"},
    # List of nested sub-RO records (one entry here).
    "ro_sub_ros": [
        {
            "sub_ro_id": "2323",
            "valid": False,
            "sub_ro_name": "Testing",
            "sub_ro_dates": {"from": now, "to": now},
            "sub_ro_budget": 1203302.22,
            "sub_ro_revenue_price": 1202302.22,
            # Single label/value options.
            "sub_ro_revenue_selected": {"label": "Fortnightly", "value": "Fortnightly"},
            "sub_ro_revenue_model_selected": {"label": "Fortnightly", "value": "Fortnightly"},
            # Lists of label/value options: each element holds label/value directly.
            "sub_ro_campaigns_selected": [{"label": "Fortnightly", "value": "Fortnightly"}],
            "sub_ro_ios_selected": [{"label": "Fortnightly", "value": "Fortnightly"}],
            "sub_ro_client_id": [{"label": "Fortnightly", "value": "Fortnightly"}],
            "sub_ro_ids_selected": [{"label": "Fortnightly", "value": "Fortnightly"}],
            "sub_ro_pixels_selected": [{"label": "Fortnightly", "value": "Fortnightly"}],
            # Single label/value options.
            "kpi_1_metric_selected": {"label": "Fortnightly", "value": "Fortnightly"},
            "attribution_model_selected": {"label": "Fortnightly", "value": "Fortnightly"},
            "kpi_window_selected": {"label": "Fortnightly", "value": "Fortnightly"},
            "deepMetrics_selected": {"label": "Fortnightly", "value": "Fortnightly"},
            "sub_ro_kpi_goal": "ROI",
        }
    ],
}
下面是我如何创建BQ模式的:
def _label_value_fields():
    """Return the REQUIRED STRING ``label``/``value`` sub-fields of an option record."""
    return [
        bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("value", "STRING", mode="REQUIRED"),
    ]


def _option(name):
    """Return a REQUIRED STRUCT field ``name`` holding one label/value pair."""
    return bigquery.SchemaField(
        name, "STRUCT", mode="REQUIRED", fields=_label_value_fields()
    )


def _option_list(name):
    """Return a REPEATED RECORD field ``name`` whose elements are label/value pairs.

    The elements carry ``label`` and ``value`` directly, matching rows such as
    ``[{"label": ..., "value": ...}]``; there is no intermediate wrapper struct.
    """
    return bigquery.SchemaField(
        name, "RECORD", mode="REPEATED", fields=_label_value_fields()
    )


def _date_range(name):
    """Return a REQUIRED STRUCT field ``name`` with REQUIRED DATE ``from``/``to``."""
    return bigquery.SchemaField(
        name,
        "STRUCT",
        mode="REQUIRED",
        fields=[
            bigquery.SchemaField("from", "DATE", mode="REQUIRED"),
            bigquery.SchemaField("to", "DATE", mode="REQUIRED"),
        ],
    )


# Table schema for one RO row. Field names and order follow the row dict.
schema = [
    bigquery.SchemaField("ro_user_email", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("ro_account_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("ro_sub_account_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("ro_name", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("ro_number", "STRING", mode="REQUIRED"),
    _option("ro_currency"),
    _date_range("ro_dates"),
    bigquery.SchemaField("ro_status", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("ro_operation_timestamp", "TIMESTAMP", mode="REQUIRED"),
    _option("ro_billing_cycle"),
    # Nested, repeated sub-RO records.
    bigquery.SchemaField(
        "ro_sub_ros",
        "RECORD",
        mode="REPEATED",
        fields=[
            bigquery.SchemaField("sub_ro_id", "STRING", mode="REQUIRED"),
            bigquery.SchemaField("valid", "BOOL", mode="REQUIRED"),
            bigquery.SchemaField("sub_ro_name", "STRING", mode="REQUIRED"),
            _date_range("sub_ro_dates"),
            bigquery.SchemaField("sub_ro_budget", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("sub_ro_revenue_price", "FLOAT", mode="REQUIRED"),
            _option("sub_ro_revenue_selected"),
            _option("sub_ro_revenue_model_selected"),
            # These five are lists of label/value records in the data, so the
            # repeated record exposes label/value directly (the previous extra
            # "model_list" wrapper struct did not exist in the rows).
            _option_list("sub_ro_campaigns_selected"),
            _option_list("sub_ro_ios_selected"),
            _option_list("sub_ro_client_id"),
            _option_list("sub_ro_ids_selected"),
            _option_list("sub_ro_pixels_selected"),
            _option("kpi_1_metric_selected"),
            _option("attribution_model_selected"),
            _option("kpi_window_selected"),
            _option("deepMetrics_selected"),
            bigquery.SchemaField("sub_ro_kpi_goal", "STRING", mode="REQUIRED"),
        ],
    ),
]
当我尝试使用 BigQuery 客户端库(client library)上传此数据时,遇到了以下错误:
job_config = bigquery.LoadJobConfig(schema=schema)
return bq.client.load_table_from_dataframe(
df, tablename, job_config=job_config
).result()
抛出:
google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: Schema mismatch: referenced variable 'ro_sub_ros.$is_not_null' has array levels of 1, while the corresponding field path to Parquet column has 0 repeated
fields.
不确定这里出了什么问题。如果我的模式太大、太笨重而不便分析,有人能给出一个使用客户端库和 pandas DataFrame 向 Google BigQuery 上传 REPEATED RECORD 的最小示例吗?
请先验证包含重复记录(REPEATED RECORD)的 BigQuery 模式是否正确,这里有一个示例;也可以查看官方文档(documentation)。
请验证记录的语法是否正确,这里(Here)是模式值的示例。
可以考虑在 Python 代码中使用“自动检测模式”(原文作 “AutoDebug Schema”,应指 schema autodetect),与此示例类似;更多内容可查看文档(documentation)。
您可以在此页面(this page)中验证 JSON 格式。
相关问题 更多 >
编程相关推荐