https://blog.tensorflow.org/2020/03/tensorflow-extended-tfx-using-apache-beam-large-scale-data-processing.html

https://4.bp.blogspot.com/-q0yJ7TRXzAo/XmajPFevFAI/AAAAAAAACyE/5rRb42zAZecTPeEC5Uri_F50Iojp34EZACLcBGAsYHQ/s1600/beam-reza.png

March 10, 2020 — Posted by Reza Rokni, Developer Advocate Google Cloud, on behalf of the TFX and Dataflow teams





TFX's core mission is to allow models to be moved from research to production, creating and managing production pipelines. Many models will be built using large volumes of data, requiring multiple hosts working in parallel to serve both the processing and serving needs of your production pipelines.



Us…

TensorFlow Extended (TFX): Using Apache Beam for large scale data processing

Apache Beam

Apache Beam Benefits

AnalyzeAndTransformDataset

ExampleGen

StatisticsGen

TFX Libraries

preprocessing_fn,

# Create an isolated Python 3 environment, activate it, and install TFX.
virtualenv tfx-beam --python=python3
source tfx-beam/bin/activate
pip install tfx

def main():
    """Run AnalyzeAndTransformDataset on the in-memory example data and print
    the raw and transformed records side by side.

    Relies on module-level `raw_data`, `raw_data_metadata`, and
    `preprocessing_fn` defined elsewhere in the article.
    """
    # tft_beam needs a scratch directory for its intermediate artifacts.
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (
            (raw_data, raw_data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset

    print('\nRaw data:\n{}\n'.format(pprint.pformat(raw_data)))
    print('Transformed data:\n{}'.format(pprint.pformat(transformed_data)))


if __name__ == '__main__':
    main()

result = pass_this | 'name this step' >> to_this_call

to_this_call

pass_this

beam_impl.Context

beam.Pipeline

--runner

--runner

import apache_beam as beam

# Explicit pipeline options; swap the runner string to target another backend.
argv = ['--runner=DirectRunner']


def main():
    """Run AnalyzeAndTransformDataset inside an explicit Beam pipeline.

    Uses module-level `raw_data`, `raw_data_metadata`, `preprocessing_fn`,
    and `beam_impl` defined elsewhere in the article.
    """
    with beam.Pipeline(argv=argv) as p:
        # Ignore the warnings
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # Renamed from `input`, which shadowed the Python builtin.
            raw_pcoll = p | beam.Create(raw_data)
            transformed_dataset, transform_fn = (
                (raw_pcoll, raw_data_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            # transformed_dataset[0] is the transformed data PCollection.
            transformed_dataset[0] | "Print Transformed Dataset" >> beam.Map(print)


if __name__ == '__main__':
    main()

# Setup our Environment
## The location of Input / Output between various stages ( TFX Components )
## This will also be the location for the Metadata
### Can be used when running the pipeline locally
#LOCAL_PIPELINE_ROOT =
### In production you want the input and output to be stored on non-local location
#GOOGLE_CLOUD_STORAGE_PIPELINE_ROOT=
#GOOGLE_CLOUD_PROJECT =
#GOOGLE_CLOUD_TEMP_LOCATION =
# Will need setup.py to make this work with Dataflow
#
# import setuptools
#
# setuptools.setup(
#     name='demo',
#     version='0.0',
#     install_requires=['tfx==0.21.1'],
#     packages=setuptools.find_packages(),)
SETUP_FILE = "./setup.py"

# NOTE: the GOOGLE_CLOUD_* placeholders above must be uncommented and filled
# in before this will run — they are referenced below.
argv = ['--project={}'.format(GOOGLE_CLOUD_PROJECT),
        '--temp_location={}'.format(GOOGLE_CLOUD_TEMP_LOCATION),
        '--setup_file={}'.format(SETUP_FILE),
        '--runner=DataflowRunner']


def main():
    """Run AnalyzeAndTransformDataset on Cloud Dataflow.

    Uses module-level `raw_data`, `raw_data_metadata`, `preprocessing_fn`,
    and `beam_impl` defined elsewhere in the article.
    """
    with beam.Pipeline(argv=argv) as p:
        with beam_impl.Context(temp_dir=GOOGLE_CLOUD_TEMP_LOCATION):
            # Renamed from `input`, which shadowed the Python builtin.
            raw_pcoll = p | beam.Create(raw_data)
            transformed_data, transformed_metadata = (
                (raw_pcoll, raw_data_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))


if __name__ == '__main__':
    main()

Using TFX Components with Beam

ExampleGen

Split data into training and evaluation sets (by default, 2/3 training + 1/3 eval)

Convert data into the tf.Example format

Copy data into the _tfx_root directory for other components to access



def createExampleGen(query: Text):
    """Build a BigQueryExampleGen that splits its output 3:1 train/eval.

    Args:
        query: BigQuery SQL string selecting the training data.

    Returns:
        A configured BigQueryExampleGen component.
    """
    # Output 2 splits: train:eval=3:1 (hash-bucket proportions).
    splits = [
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=3),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
    ]
    split_config = example_gen_pb2.SplitConfig(splits=splits)
    return BigQueryExampleGen(
        query=query,
        output_config=example_gen_pb2.Output(split_config=split_config))

bigquery-public-data.chicago_taxi_trips.taxi_trips.

# BigQuery SQL pulling Chicago taxi-trip features from the public dataset;
# LIMIT 100 keeps this demo small and cheap to run.
query=""" SELECT pickup_community_area, fare, EXTRACT(MONTH FROM trip_start_timestamp) trip_start_month, EXTRACT(HOUR FROM trip_start_timestamp) trip_start_hour, EXTRACT(DAYOFWEEK FROM trip_start_timestamp) trip_start_day, UNIX_Millis(trip_start_timestamp) trip_start_ms_timestamp, pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude, trip_miles, pickup_census_tract, dropoff_census_tract, payment_type, company, trip_seconds, dropoff_community_area, tips FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` LIMIT 100 """

StatisticsGen

def createStatisticsGen(bigQueryExampleGen: BigQueryExampleGen):
    """Create a StatisticsGen that consumes the given ExampleGen's examples.

    Computes statistics over the data for visualization and example validation.
    """
    return StatisticsGen(examples=bigQueryExampleGen.outputs['examples'])

Pipeline Orchestration

BeamDagRunner

# Used for setting up the orchestration from tfx.orchestration import pipeline from tfx.orchestration import metadata from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

from typing import Text
from typing import Type


def createTfxPipeline(pipeline_name: Text, pipeline_root: Text, query: Text,
                      beam_pipeline_args) -> pipeline.Pipeline:
    """Assemble a two-component TFX pipeline: BigQueryExampleGen -> StatisticsGen.

    Args:
        pipeline_name: Name of the pipeline; also used for the metadata path.
        pipeline_root: Location for component inputs/outputs and metadata.
        query: BigQuery SQL string selecting the training data.
        beam_pipeline_args: Extra args forwarded to the underlying Beam pipeline.

    Returns:
        A pipeline.Pipeline ready to hand to an orchestrator (e.g. BeamDagRunner).
    """
    # Output 2 splits: train:eval=3:1.
    split_config = example_gen_pb2.SplitConfig(splits=[
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=3),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
    ])
    output = example_gen_pb2.Output(split_config=split_config)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = BigQueryExampleGen(query=query, output_config=output)
    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Local SQLite-backed ML Metadata store keyed by pipeline name.
    metadata_db = os.path.join(".", 'metadata', pipeline_name, 'metadata.db')
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen, statistics_gen],
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_db),
        enable_cache=False,
        additional_pipeline_args=beam_pipeline_args)

LIMIT 100

# Build and run the pipeline locally with Beam's DirectRunner.
direct_runner_args = {
    'beam_pipeline_args': [
        '--project={}'.format(GOOGLE_CLOUD_PROJECT),
        '--runner=DirectRunner',
    ]
}
tfx_pipeline = createTfxPipeline(
    pipeline_name="my_first_directRunner_pipeline",
    pipeline_root=LOCAL_PIPELINE_ROOT,
    query=query,
    beam_pipeline_args=direct_runner_args)
BeamDagRunner().run(tfx_pipeline)

LOCAL_PIPELINE_ROOT

import os
import tensorflow_data_validation as tfdv

# Load the statistics artifact written by StatisticsGen and render it.
# NOTE(review): the " " path segment looks like an extraction artifact from the
# original article — in practice this level is an execution-id directory;
# confirm the actual path under LOCAL_PIPELINE_ROOT before running.
stats_path = os.path.join(LOCAL_PIPELINE_ROOT, "StatisticsGen", "statistics",
                          " ", "train", "stats_tfrecord")
stats = tfdv.load_statistics(stats_path)
tfdv.visualize_statistics(stats)

# Build and run the same pipeline at scale on Cloud Dataflow.
dataflow_runner_args = {
    'beam_pipeline_args': [
        '--project={}'.format(GOOGLE_CLOUD_PROJECT),
        '--temp_location={}'.format(GOOGLE_CLOUD_TEMP_LOCATION),
        '--setup_file=./setup.py',
        '--runner=DataflowRunner',
    ]
}
tfx_pipeline = createTfxPipeline(
    pipeline_name="my_first_dataflowRunner_pipeline",
    pipeline_root=GOOGLE_CLOUD_STORAGE_PIPELINE_ROOT,
    query=query,
    beam_pipeline_args=dataflow_runner_args)
BeamDagRunner().run(tfx_pipeline)

BeamDagRunner

ExampleGen

StatisticsGen

num_*_feature

Summary

For more information