How to handle operations on local files across multiple ParDo transforms in Apache Beam / Google Cloud Dataflow

Published 2024-09-30 04:33:29


I'm developing an ETL pipeline for Google Cloud Dataflow in which I have several branching ParDo transforms, each of which requires a local audio file. The branch results are then merged and exported as text.

This was initially a Python script running on a single machine that I'm attempting to adapt to GC Dataflow for parallelisation across VM workers.

The extraction step downloads each file from a single GCS bucket location and deletes it once the transform completes, to keep storage under control. This is because the preprocessing module needs local access to the files. It could be re-engineered to handle a byte stream instead of a file by rewriting some of the preprocessing libraries; however, some attempts at that haven't gone well, and I'd first like to explore how Apache Beam / GC Dataflow handles parallel local file operations, to better understand the framework.

In this rough implementation each branch downloads and deletes the file, which means a lot of duplicated work. My implementation has 8 branches, so every file is downloaded and deleted 8 times. Could a GCS bucket instead be mounted on each worker, rather than downloading the files from the remote?

Or is there another way to hand workers the correct references to the files, so that:

  • a single DownloadFilesDoFn() can download a batch of files,
  • the local file references in the PCollection can then be fanned out to all branches,
  • and a final CleanUpFilesDoFn() can remove them?
  • How can local file references be parallelised?

If local file operations cannot be avoided, what is the best branched ParDo strategy for Apache Beam / GC Dataflow?


For simplicity, some sample code of the existing implementation with two branches:

# Singleton decorator: caches one instance per decorated class.
# getinstance must forward constructor arguments, otherwise
# Predict(self.model) below raises a TypeError.
def singleton(cls):
  instances = {}
  def getinstance(*args, **kwargs):
      if cls not in instances:
          instances[cls] = cls(*args, **kwargs)
      return instances[cls]
  return getinstance
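One caveat with the decorator above: the cache is keyed by class only, so a second call with a different model argument silently returns the first instance. That matters here, since two models are run per worker. A minimal standalone demonstration (using a hypothetical `Cache` class as a stand-in for `Predict`, and a `getinstance` that forwards constructor arguments):

```python
# Singleton decorator, with getinstance forwarding constructor arguments.
def singleton(cls):
    instances = {}
    def getinstance(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]
    return getinstance

@singleton
class Cache:  # hypothetical stand-in for Predict()
    def __init__(self, name):
        self.name = name

a = Cache('model1')
b = Cache('model2')  # arguments ignored: the cached instance is returned
print(a is b)   # True
print(b.name)   # model1 -- the second model never gets its own instance
```

If one instance per model is wanted, the cache would need to be keyed by the constructor arguments as well as the class.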

@singleton
class Predict():
  def __init__(self, model):
    self.model = model

  def process(self, filename):
    '''Processes audio read in from filename; returns a prediction.'''
    # simplified pseudocode
    audio = preprocess.load(filename=filename)
    prediction = inference(self.model, audio)
    return prediction

class PredictDoFn(beam.DoFn):
  def __init__(self, model):
    self.model = model

  def start_bundle(self):
    self.localfiles = []

  def process(self, element):
    # Construct Predict() object singleton per worker
    predict = Predict(self.model)

    # Download the file next to the working directory
    # (cwd is assumed to be defined elsewhere)
    subprocess.run(['gsutil', 'cp', element['GCSPath'], './'], cwd=cwd, shell=False)
    localfile = cwd + "/" + element['GCSPath'].split('/')[-1]
    self.localfiles.append(localfile)

    res = predict.process(localfile)
    return [{
        'Index': element['Index'],
        'Title': element['Title'],
        'File' : element['GCSPath'],
        self.model + 'Prediction': res
        }]

  def finish_bundle(self):
    # Remove every file downloaded in this bundle, not just the last one
    for f in self.localfiles:
      subprocess.run(['rm', f], cwd=cwd, shell=False)


# DoFn to split a CSV line into a dict element
# (the GCS bucket could perhaps be read as a PCollection instead)
class Split(beam.DoFn):
    def process(self, element):
        Index, Title, GCSPath = element.split(",")
        GCSPath = 'gs://mybucket/' + GCSPath
        return [{
            'Index': int(Index),
            'Title': Title,
            'GCSPath': GCSPath
        }]
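A side note on the parse above: `element.split(",")` breaks if a Title itself contains a comma, whereas Python's `csv` module handles quoted fields. A hedged alternative sketch of the same parse (assuming the same field order and bucket prefix):

```python
import csv

def parse_row(line, bucket='gs://mybucket/'):
    """Parse one CSV line into the dict shape used by the pipeline."""
    index, title, gcs_path = next(csv.reader([line]))
    return {
        'Index': int(index),
        'Title': title,
        'GCSPath': bucket + gcs_path,
    }

row = parse_row('3,"Song, with comma",audio/track3.wav')
print(row['Title'])    # Song, with comma
print(row['GCSPath'])  # gs://mybucket/audio/track3.wav
```

A plain function like this could be applied with `beam.Map(parse_row)` instead of a full DoFn.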

A simplified version of the pipeline:

with beam.Pipeline(argv=pipeline_args) as p:
    files = (
        p | 'Read From CSV' >> beam.io.ReadFromText(known_args.input)
          | 'Parse CSV into Dict' >> beam.ParDo(Split())
    )
    # prediction 1 branch
    preds1 = files | 'Prediction 1' >> beam.ParDo(PredictDoFn(model1))
    # prediction 2 branch
    preds2 = files | 'Prediction 2' >> beam.ParDo(PredictDoFn(model2))

    # join branches into a single PCollection
    joined = (preds1, preds2) | 'Merge Branches' >> beam.Flatten()

    # output to file
    output = joined | 'WriteToText' >> beam.io.WriteToText(known_args.output)


1 Answer

Answered 2024-09-30 04:33:29

To avoid downloading the file repeatedly, you can put the file's contents into the PCollection:

class DownloadFilesDoFn(beam.DoFn):
  def __init__(self):
    import re
    self.gcs_path_regex = re.compile(r'gs://([^/]+)/(.*)')

  def start_bundle(self):
    import google.cloud.storage
    self.gcs = google.cloud.storage.Client()

  def process(self, element):
    file_match = self.gcs_path_regex.match(element['GCSPath'])
    bucket = self.gcs.get_bucket(file_match.group(1))
    blob = bucket.get_blob(file_match.group(2))
    element['file_contents'] = blob.download_as_bytes()
    yield element
     

PredictDoFn then becomes:

class PredictDoFn(beam.DoFn):
  def __init__(self, model):
    self.model = model

  def start_bundle(self):
    self.predict = Predict(self.model)

  def process(self, element):
    # NOTE: Predict.process must be adapted to accept raw bytes
    # rather than a filename for this to work
    res = self.predict.process(element['file_contents'])
    return [{
        'Index': element['Index'],
        'Title': element['Title'],
        'File' : element['GCSPath'],
        self.model + 'Prediction': res
        }]
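If the preprocessing library insists on a filename and rewriting it to accept bytes proves difficult, the downloaded bytes can be bridged through a temporary file inside `process()`, keeping the download-once benefit while preserving the local-file API. A stdlib-only sketch, where `process_local` is a hypothetical stand-in for the real filename-based `preprocess.load` call:

```python
import os
import tempfile

def with_local_file(file_contents, process_local, suffix='.wav'):
    """Write bytes to a temp file, run a filename-based function, clean up."""
    fd, path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, 'wb') as f:
            f.write(file_contents)
        return process_local(path)
    finally:
        os.remove(path)  # the file is deleted even if processing raises

# Example: "processing" just measures the file size.
size = with_local_file(b'\x00' * 1024, lambda p: os.path.getsize(p))
print(size)  # 1024
```

Because each call cleans up after itself, no `finish_bundle` bookkeeping of downloaded paths is needed.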

The pipeline:

with beam.Pipeline(argv=pipeline_args) as p:
    files = (
        p | 'Read From CSV' >> beam.io.ReadFromText(known_args.input)
          | 'Parse CSV into Dict' >> beam.ParDo(Split())
          | 'Read files' >> beam.ParDo(DownloadFilesDoFn())
    )
    # prediction 1 branch
    preds1 = files | 'Prediction 1' >> beam.ParDo(PredictDoFn(model1))
    # prediction 2 branch
    preds2 = files | 'Prediction 2' >> beam.ParDo(PredictDoFn(model2))

    # join branches into a single PCollection
    joined = (preds1, preds2) | 'Merge Branches' >> beam.Flatten()

    # output to file
    output = joined | 'WriteToText' >> beam.io.WriteToText(known_args.output)
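Note that `beam.Flatten` only concatenates the two branches' outputs; to combine them into one record per file, a common Beam pattern is to key each branch by `Index` and apply `beam.CoGroupByKey`, then merge the grouped dicts. The per-key merge logic that such a step would apply can be sketched in plain Python (record shapes assumed to follow PredictDoFn's output):

```python
from collections import defaultdict

def merge_branches(*branches):
    """Merge per-branch prediction dicts into one record per Index."""
    merged = defaultdict(dict)
    for branch in branches:
        for record in branch:
            # Shared keys (Index, Title, File) are identical across
            # branches; prediction keys differ per model, so update()
            # accumulates them into a single record.
            merged[record['Index']].update(record)
    return [merged[i] for i in sorted(merged)]

preds1 = [{'Index': 1, 'Title': 'a', 'm1Prediction': 0.9}]
preds2 = [{'Index': 1, 'Title': 'a', 'm2Prediction': 0.2}]
rows = merge_branches(preds1, preds2)
print(rows)  # [{'Index': 1, 'Title': 'a', 'm1Prediction': 0.9, 'm2Prediction': 0.2}]
```

In the pipeline itself this would look roughly like `({'m1': keyed1, 'm2': keyed2}) | beam.CoGroupByKey()` followed by a `beam.Map` applying the same merge.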
