使用Python以JSON格式提取/格式化数据的最佳方法?

2024-05-03 09:23:16 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试对批量专利数据进行一些数据分析(通常可以在这里找到数据,但目前该数据源暂时无法访问)

以下是JSON文件中的第一个条目:

{
  "PatentBulkData":[
    {
      "patentCaseMetadata":{
        "applicationNumberText":{
          "value":"15733015",
          "electronicText":"15733015"
        },
        "filingDate":"2020-01-01",
        "applicationTypeCategory":"Utility",
        "partyBag":{
          "applicantBagOrInventorBagOrOwnerBag":[
            {
              "applicant":[
                {
                  "contactOrPublicationContact":[
                    {
                      "name":{
                        "personNameOrOrganizationNameOrEntityName":[
                          {
                            "personStructuredName":{
                              "firstName":"Birol",
                              "middleName":"",
                              "lastName":"Cimen"
                            }
                          }
                        ]
                      },
                      "cityName":"Hengelo",
                      "geographicRegionName":{
                        "value":"",
                        "geographicRegionCategory":"STATE"
                      },
                      "countryCode":"NL"
                    }
                  ]
                }
              ]
            },
            {
              "partyIdentifierOrContact":[
                {
                  "name":{
                    "personNameOrOrganizationNameOrEntityName":[
                      {
                        "personStructuredName":{
                          "lastName":"Oppedahl Patent Law Firm LLC (Mink)"
                        }
                      }
                    ]
                  },
                  "postalAddressBag":{
                    "postalAddress":[
                      {
                        "postalStructuredAddress":{
                          "addressLineText":[
                            {
                              "value":"P O Box 351240"
                            }
                          ],
                          "cityName":"Westminster",
                          "geographicRegionName":[
                            {
                              "value":"CO"
                            }
                          ],
                          "countryCode":"US",
                          "postalCode":"80035"
                        }
                      }
                    ]
                  }
                },
                {
                  "value":"133517"
                }
              ]
            }
          ]
        },
        "groupArtUnitNumber":{
          "value":"3771",
          "electronicText":"3771"
        },
        "applicationConfirmationNumber":"7897",
        "applicantFileReference":"FP01.P035 SST02US",
        "priorityClaimBag":{
          "priorityClaim":[
            {
              "ipOfficeName":"NETHERLANDS",
              "applicationNumber":{
                "applicationNumberText":"2019179"
              },
              "filingDate":"2017-07-05",
              "sequenceNumber":"1"
            }
          ]
        },
        "patentClassificationBag":{
          "cpcClassificationBagOrIPCClassificationOrECLAClassificationBag":[
            {
              "ipOfficeCode":"US",
              "mainNationalClassification":{
                "nationalClass":"606",
                "nationalSubclass":"133000"
              }
            }
          ]
        },
        "businessEntityStatusCategory":"SMALL",
        "firstInventorToFileIndicator":"true",
        "inventionTitle":{
          "content":[
            "Hair removal device for removing body hair on a body surface"
          ]
        },
        "applicationStatusCategory":"Application Dispatched from Preexam, Not Yet Docketed",
        "applicationStatusDate":"2020-05-08",
        "officialFileLocationCategory":"ELECTRONIC",
        "patentPublicationIdentification":{
          "publicationNumber":"US20200170371A1",
          "publicationDate":"2020-06-04"
        },
        "relatedDocumentData":{
          "parentDocumentDataOrChildDocumentData":[
            {
              "descriptionText":"This application is National Stage Entry of",
              "applicationNumberText":"PCT/NL2018/050434",
              "filingDate":"2018-07-04",
              "parentDocumentStatusCode":"Published",
              "patentNumber":""
            }
          ]
        }
      },
      "prosecutionHistoryDataBag":{
        "prosecutionHistoryData":[
          {
            "eventDate":"2020-06-05",
            "eventCode":"PG-ISSUE",
            "eventDescriptionText":"PG-Pub Issue Notification"
          },
          {
            "eventDate":"2020-05-11",
            "eventCode":"M903",
            "eventDescriptionText":"Notice of DO/EO Acceptance Mailed"
          },
          {
            "eventDate":"2020-05-11",
            "eventCode":"FLRCPT.U",
            "eventDescriptionText":"Filing Receipt - Updated"
          },
          {
            "eventDate":"2020-05-11",
            "eventCode":"MPEN",
            "eventDescriptionText":"Mail Pre-Exam Notice"
          },
          {
            "eventDate":"2020-02-26",
            "eventCode":"EML_NTR",
            "eventDescriptionText":"Email Notification"
          },
          {
            "eventDate":"2020-02-26",
            "eventCode":"EML_NTR",
            "eventDescriptionText":"Email Notification"
          },
          {
            "eventDate":"2020-02-26",
            "eventCode":"CCRDY",
            "eventDescriptionText":"Application ready for PDX access by participating foreign offices"
          },
          {
            "eventDate":"2020-01-05",
            "eventCode":"371COMP",
            "eventDescriptionText":"371 Completion Date"
          },
          {
            "eventDate":"2020-02-25",
            "eventCode":"PGPC",
            "eventDescriptionText":"Sent to Classification Contractor"
          },
          {
            "eventDate":"2020-02-25",
            "eventCode":"FTFS",
            "eventDescriptionText":"FITF set to YES - revise initial setting"
          },
          {
            "eventDate":"2020-01-02",
            "eventCode":"PTA.RFE",
            "eventDescriptionText":"Patent Term Adjustment - Ready for Examination"
          },
          {
            "eventDate":"2020-02-26",
            "eventCode":"FLRCPT.O",
            "eventDescriptionText":"Filing Receipt"
          },
          {
            "eventDate":"2020-02-26",
            "eventCode":"M903",
            "eventDescriptionText":"Notice of DO/EO Acceptance Mailed"
          },
          {
            "eventDate":"2019-12-31",
            "eventCode":"SREXR141",
            "eventDescriptionText":"PTO/SB/69-Authorize EPO Access to Search Results"
          },
          {
            "eventDate":"2019-12-31",
            "eventCode":"APPERMS",
            "eventDescriptionText":"Applicants have given acceptable permission for participating foreign "
          },
          {
            "eventDate":"2020-02-25",
            "eventCode":"SMAL",
            "eventDescriptionText":"Applicant Has Filed a Verified Statement of Small Entity Status in Compliance with 37 CFR 1.27"
          },
          {
            "eventDate":"2019-12-31",
            "eventCode":"L194",
            "eventDescriptionText":"Cleared by OIPE CSR"
          },
          {
            "eventDate":"2019-12-31",
            "eventCode":"WIDS",
            "eventDescriptionText":"Information Disclosure Statement (IDS) Filed"
          },
          {
            "eventDate":"2019-12-31",
            "eventCode":"WIDS",
            "eventDescriptionText":"Information Disclosure Statement (IDS) Filed"
          },
          {
            "eventDate":"2019-12-31",
            "eventCode":"BIG.",
            "eventDescriptionText":"ENTITY STATUS SET TO UNDISCOUNTED (INITIAL DEFAULT SETTING OR STATUS CHANGE)"
          },
          {
            "eventDate":"2019-12-31",
            "eventCode":"IEXX",
            "eventDescriptionText":"Initial Exam Team nn"
          }
        ]
      },
      "st96Version":"V3_1",
      "ipoVersion":"US_V8_0"
    },

我将json数据作为字典导入。但是,获取我想要检索的信息的最佳方式是什么?我应该使用pandas的json_normalize将其展平并转换为数据帧吗?

我想特别检索“prosecutionHistoryDataBag”中的信息。例如,对于其他专利申请,这将提供关于已经发出了多少次审查意见通知(office action)的具体信息

最后,我想把审查意见通知(office action)数据与专利审查员进行交叉引用(当申请已分配给审查员时,审查员信息可以在“applicantBagOrInventorBagOrOwnerBag”中找到)

有没有好的资源来解释如何清理json数据,这样我就可以将这些信息分成不同的列

谢谢你提供的信息!以下是一个包含审查员的条目示例:

   {
         "patentCaseMetadata":{
            "applicationNumberText":{
               "value":"16732312",
               "electronicText":"16732312"
            },
            "filingDate":"2020-01-01",
            "applicationTypeCategory":"Utility",
            "partyBag":{
               "applicantBagOrInventorBagOrOwnerBag":[
                  {
                     "primaryExaminerOrAssistantExaminerOrAuthorizedOfficer":[
                        {
                           "name":{
                              "personNameOrOrganizationNameOrEntityName":[
                                 {
                                    "personFullName":"ORGAD, EDAN"
                                 }
                              ]
                           }
                        }
                     ]
                  },
                  {
                     "applicant":[
                        {
                           "contactOrPublicationContact":[
                              {
                                 "name":{
                                    "personNameOrOrganizationNameOrEntityName":[
                                       {
                                          "organizationStandardName":{
                                             "content":[
                                                "Communication Systems LLC"
                                             ]
                                          }
                                       }
                                    ]
                                 },
                                 "cityName":"Santa Fe",
                                 "geographicRegionName":{
                                    "value":"NM",
                                    "geographicRegionCategory":"STATE"
                                 },
                                 "countryCode":""
                              }
                           ]
                        }
                     ]
                  }
               ]
            },
            "groupArtUnitNumber":{
               "value":"2414",
               "electronicText":"2414"
            },
            "applicationConfirmationNumber":"8996",
            "applicantFileReference":"CS1003US03",
            "patentClassificationBag":{
               "cpcClassificationBagOrIPCClassificationOrECLAClassificationBag":[
                  {
                     "ipOfficeCode":"US",
                     "mainNationalClassification":{
                        "nationalClass":"370",
                        "nationalSubclass":"329000"
                     }
                  }
               ]
            },
            "businessEntityStatusCategory":"SMALL",
            "firstInventorToFileIndicator":"true",
            "inventionTitle":{
               "content":[
                  "APPARATUSES, METHODS, AND COMPUTER-READABLE MEDIUM FOR COMMUNICATION IN A WIRELESS LOCAL AREA NETWORK"
               ]
            },
            "applicationStatusCategory":"Docketed New Case - Ready for Examination",
            "applicationStatusDate":"2020-02-07",
            "officialFileLocationCategory":"ELECTRONIC",
            "patentPublicationIdentification":{
               "publicationNumber":"US20200154403A1",
               "publicationDate":"2020-05-14"
            }
         },
         "prosecutionHistoryDataBag":{
            "prosecutionHistoryData":[
               {
                  "eventDate":"2020-05-19",
                  "eventCode":"PG-ISSUE",
                  "eventDescriptionText":"PG-Pub Issue Notification"
               }
            ]
         },
         "assignmentDataBag":{
            "assignmentData":[
               {
                  "reelNumber":"52436",
                  "frameNumber":"295",
                  "documentReceivedDate":"2020-04-20",
                  "recordedDate":"2020-04-20",
                  "mailDate":"2020-04-21",
                  "pageTotalQuantity":3,
                  "conveyanceText":"ASSIGNMENT OF ASSIGNORS INTEREST (SEE DOCUMENT FOR DETAILS).",
                  "assignorBag":{
                     "assignor":[
                        {
                           "executionDate":"2016-07-14",
                           "contactOrPublicationContact":[
                              {
                                 "name":{
                                    "personNameOrOrganizationNameOrEntityName":[
                                       {
                                          "value":"ATEFI, ALI"
                                       }
                                    ]
                                 }
                              }
                           ]
                        }
                     ]
                  },
                  "assigneeBag":{
                     "assignee":[
                        {
                           "contactOrPublicationContact":[
                              {
                                 "name":{
                                    "personNameOrOrganizationNameOrEntityName":[
                                       {
                                          "value":"COMMUNICATION SYSTEMS LLC"
                                       }
                                    ]
                                 },
                                 "postalAddressBag":{
                                    "postalAddress":[
                                       {
                                          "postalAddressText":[
                                             {
                                                "sequenceNumber":"1",
                                                "value":"530-B HARKLE ROAD"
                                             },
                                             {
                                                "sequenceNumber":"2",
                                                "value":"STE. 100"
                                             },
                                             {
                                                "sequenceNumber":"3",
                                                "value":"SANTA FE NEW MEXICO 87505"
                                             }
                                          ]
                                       }
                                    ]
                                 }
                              }
                           ]
                        }
                     ]
                  },
                  "correspondenceAddress":{
                     "partyIdentifierOrContact":[
                        {
                           "name":{
                              "personNameOrOrganizationNameOrEntityName":[
                                 {
                                    "value":"ALI ATEFI"
                                 }
                              ]
                           },
                           "postalAddressBag":{
                              "postalAddress":[
                                 {
                                    "postalAddressText":[
                                       {
                                          "sequenceNumber":"1",
                                          "value":"530-B HARKLE ROAD"
                                       },
                                       {
                                          "sequenceNumber":"2",
                                          "value":"STE. 100"
                                       },
                                       {
                                          "sequenceNumber":"3",
                                          "value":"SANTA FE, NM 87505"
                                       }
                                    ]
                                 }
                              ]
                           }
                        }
                     ]
                  },
                  "sequenceNumber":"1"
               }
            ],
            "assignmentTotalQuantity":1
         },
         "st96Version":"V3_1",
         "ipoVersion":"US_V8_0"
      },

我的解析无法越过applicantBagOrInventorBagOrOwnerBag这一层。下面是我尝试获取审查员姓名的解析示例,它返回一个空数据帧:

jsonpath_expression = parse('PatentBulkData[*].patentCaseMetadata.partyBag.applicantBagOrInventorBagOrOwnerBag.primaryExaminerOrAssistantExaminerOrAuthorizedOfficer.name.personNameOrOrganizationNameOrEntityName.personFullName[*]')

如果我的路径以applicantBagOrInventorBagOrOwnerBag结尾,就会返回一个包含正确信息的数据帧——只是其中还带有括号和所有其他JSON符号。我是否遗漏了某个关键结构?

再次感谢


Tags: 数据 name 信息 for value us eventdate sequencenumber
1条回答
网友
1楼 · 发布于 2024-05-03 09:23:16

对于解析或多或少复杂的JSON文档,您可能需要看一下JSONPath“查询语言”

jsonpath-rw库中有一个很好的Python实现。因为您需要的数据是这样嵌套的:

{
  "PatentBulkData": [
    {
      "prosecutionHistoryDataBag": {
        "prosecutionHistoryData": [
          {
            "eventDate": "2020-06-05",
            "eventCode": "PG-ISSUE",
            "eventDescriptionText": "PG-Pub Issue Notification"
          },

JSONPath将是

Under the key PatentBulkData, get every element of the array, then the key prosecutionHistoryDataBag, then the key prosecutionHistoryData, and finally all array elements under that.

PatentBulkData[*].prosecutionHistoryDataBag.prosecutionHistoryData[*]

这就是您在Python中要做的

import json

from jsonpath_rw import jsonpath, parse
import pandas as pd

# Parse the string containing the whole JSON document into Python objects.
# <YOUR_JSON_STRING> is a placeholder: substitute the actual JSON text.
data = json.loads(<YOUR_JSON_STRING>)

# Compile the JSONPath: every element of the PatentBulkData array, then
# every element of its prosecutionHistoryDataBag.prosecutionHistoryData array.
jsonpath_expr = parse('PatentBulkData[*].prosecutionHistoryDataBag.prosecutionHistoryData[*]')

# Extract the raw value from each matching element,
# i.e. every element of the JSON array
matches = [match.value for match in jsonpath_expr.find(data)]

# Create dataframe from the list of dictionaries
df = pd.DataFrame.from_records(matches)

结果:

| eventDate   | eventCode   | eventDescriptionText              |
|:------------|:------------|:----------------------------------|
| 2020-06-05  | PG-ISSUE    | PG-Pub Issue Notification         |
| 2020-05-11  | M903        | Notice of DO/EO Acceptance Mailed |
| 2020-05-11  | FLRCPT.U    | Filing Receipt - Updated          |
| 2020-05-11  | MPEN        | Mail Pre-Exam Notice              |
| 2020-02-26  | EML_NTR     | Email Notification                |

编辑

对于审查员查询,您需要注意嵌套数组。路径中每遇到一个数组,都需要指定是取其中某一个元素([0]、[1] 等)还是取数组中的所有元素([*]):

# JSONPath for the examiner name. Every array along the path
# (PatentBulkData, applicantBagOrInventorBagOrOwnerBag,
# primaryExaminerOrAssistantExaminerOrAuthorizedOfficer and
# personNameOrOrganizationNameOrEntityName) is expanded with [*] before
# descending to the next key.
examiner_expr = parse(
    "PatentBulkData[*].patentCaseMetadata.partyBag"
    ".applicantBagOrInventorBagOrOwnerBag[*]"
    ".primaryExaminerOrAssistantExaminerOrAuthorizedOfficer[*]"
    ".name.personNameOrOrganizationNameOrEntityName[*]"
    ".personFullName"
)
# Collect the matched personFullName values; the next line shows the
# answer's output for the sample entry.
[match.value for match in examiner_expr.find(data)]
# ['ORGAD, EDAN']

相关问题 更多 >