I use db/store-sales-time-series-forecasting_training.csv, whose sales data cover 2016-08-01 through 2017-07-31, and insert it into BigQuery via the Console by uploading the CSV file.
Dataset ID = store_sales
Table ID = simplified_data_table
import os
from google.cloud import bigquery

# Point the client at the service-account credentials.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'newacc_gcp_credential.json'

query = """
SELECT date, store_nbr, family, sales, onpromotion
FROM `scalable-model-piplines.store_sales.simplified_data_table`
LIMIT 100
"""

client = bigquery.Client()
train_data = client.query(query).to_dataframe()
train_data.head()
| | date | store_nbr | family | sales | onpromotion |
|---|---|---|---|---|---|
| 0 | 2016-08-03 | 1 | PERSONAL CARE | 200.000 | 0 |
| 1 | 2016-08-04 | 1 | PRODUCE | 1909.962 | 0 |
| 2 | 2016-08-07 | 1 | PRODUCE | 1016.462 | 0 |
| 3 | 2016-08-09 | 1 | PRODUCE | 2044.128 | 0 |
| 4 | 2016-08-14 | 1 | PERSONAL CARE | 47.000 | 0 |
The full code demonstrating the Model Trainer is in /demo-code/prediction_service/model_trainer.py.
After fetching the data from BigQuery, we do feature engineering to capture seasonality.
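For example (a minimal sketch with pandas; the exact features used in model_trainer.py may differ), seasonality can be derived from the date column like this:

```python
import numpy as np
import pandas as pd

# Toy frame standing in for the BigQuery result.
df = pd.DataFrame({
    "date": pd.to_datetime(["2016-08-03", "2016-08-04", "2016-08-07"]),
    "sales": [200.0, 1909.962, 1016.462],
})

# Calendar features that let the model pick up weekly/yearly seasonality.
df["day_of_week"] = df["date"].dt.dayofweek
df["month"] = df["date"].dt.month

# Cyclical encoding so that, e.g., December and January end up close together.
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)
```

The cyclical sine/cosine encoding is one common choice; plain one-hot month indicators work as well.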
We fit the model and temporarily save the trained model and its metadata to the tmp/ directory.
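A minimal sketch of this save step (the dict below is a stand-in for the fitted estimator, and the serializer choice is an assumption; model_trainer.py may differ):

```python
import os
import pickle
from datetime import date

# Stand-in for the fitted estimator; in model_trainer.py this would be
# the trained sklearn model.
model = {"coef": [0.5, 1.2], "intercept": 3.0}

model_name = "sale_forecasting_sklearn"
today = date.today().isoformat()  # e.g. "2023-02-28"

# Save the model under a date-stamped name in tmp/.
os.makedirs("tmp", exist_ok=True)
model_path = os.path.join("tmp", model_name + today + ".pkl")
with open(model_path, "wb") as f:
    pickle.dump(model, f)
```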
We then upload the trained model to Google Cloud Storage using the gcsfs library:
import gcsfs

def upload_File_to_Cloud_Storage(src_dir: str, gcs_dst: str, recursive=False):
    fs = gcsfs.GCSFileSystem()
    fs.put(src_dir, gcs_dst, recursive=recursive)

bucket_name = "gs://scalable-model-piplines-trained_model/"
upload_File_to_Cloud_Storage(path + model_name + today,
                             bucket_name + model_name + today, recursive=True)
upload_File_to_Cloud_Storage(path + dp_name + today + '.pkl',
                             bucket_name + dp_name + today + '.pkl')
I assume the model is trained every day, so the model's file name follows the format "sale_forecasting_sklearn" + today's date, for example sale_forecasting_sklearn2023-02-28.
We can specify the model version or overwrite the previous model to make it persistent and consistent.
We also ensure the model was trained properly and accurately before moving it to production, and we run services that double-check the model and alert developers about any arising issues. Cross-checking is good practice to ensure everything is going well.
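As a minimal sketch of such a check (the metric, threshold value, and alerting hook below are all assumptions, not part of the original service), we can gate promotion on a holdout metric:

```python
import math

# Hypothetical promotion gate: block deployment and alert if the
# holdout RMSE exceeds a chosen threshold (the value is an assumption).
RMSE_THRESHOLD = 500.0

def rmse(y_true, y_pred):
    """Root mean squared error over two equal-length sequences."""
    return math.sqrt(sum((t - p) ** 2 for t, p in zip(y_true, y_pred)) / len(y_true))

def validate_model(y_true, y_pred, alert=print):
    """Return True if the model passes; otherwise fire an alert and return False."""
    score = rmse(y_true, y_pred)
    if score > RMSE_THRESHOLD:
        alert(f"Model failed validation: RMSE={score:.2f} > {RMSE_THRESHOLD}")
        return False
    return True
```

In practice `alert` would post to a monitoring or paging system rather than print.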
$ docker image build -t "prediction_service" .
$ docker run -d -p 8080:80 prediction_service
$ docker tag prediction_service us.gcr.io/scalable-model-piplines/prediction_service
$ docker push us.gcr.io/scalable-model-piplines/prediction_service
Now we can deploy this image to Google Kubernetes Engine, expose it with a load balancer, and map the port to 80.
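A sketch of what that GKE deployment could look like (the resource names and replica count are illustrative assumptions; an equivalent kubectl invocation works too):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prediction-service
spec:
  replicas: 2
  selector:
    matchLabels:
      app: prediction-service
  template:
    metadata:
      labels:
        app: prediction-service
    spec:
      containers:
        - name: prediction-service
          image: us.gcr.io/scalable-model-piplines/prediction_service
          ports:
            - containerPort: 80
---
apiVersion: v1
kind: Service
metadata:
  name: prediction-service
spec:
  type: LoadBalancer
  selector:
    app: prediction-service
  ports:
    - port: 80
      targetPort: 80
```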

import requests
result = requests.get("http://34.69.150.86/run")
print(result.json())
{'success': True}
Now the Model Trainer can be called via HTTP Web Endpoint in the demonstration.
## Cloud Composer

After running the Model Trainer on GKE, we set up a schedule to run this service every day at 00:00. It is possible to self-host Airflow on Kubernetes, but that can be complex to set up. There are also fully managed Airflow offerings on cloud platforms, such as Cloud Composer on GCP.
Enable the Google Cloud Composer API and create a new environment named run-model-trainer-everday.
Add DAG to GKE cluster which runs Model Trainer. In Cloud Composer UI >> select Composer cluster >>
For the purpose of demonstration, we call the Model Trainer via its HTTP endpoint. Below is a snippet from task1_run_Model_Trainer.py:
def main():
    import requests
    result = requests.get("http://34.69.150.86/run")
    print(result.json())  # expect {'success': True}
However, Cloud Composer can do so much more. For example, it can run a Python app to build and train a model, then run another (Python) app to either deploy the trained model to a storage or containerize it to a Docker image.
GCP provides Cloud Build to run a job to build and push the Docker image to Artifact Registry.
Cloud Composer runs and manages scripts in a bucket of Google Storage. In this case, it's us-west1-run-model-trainer--5f3a0641-bucket
us-west1-run-model-trainer--5f3a0641-bucket/
dags/
train_model.py
tasks/
task1_run_Model_Trainer.py
task2_upload_Model.py
The template for train_model.py is
from datetime import datetime, timedelta

from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

from tasks import task1_run_Model_Trainer, task2_upload_Model

default_dag_args = {}

with DAG(
    dag_id="train_model",
    default_args=default_dag_args,
    start_date=datetime(2023, 2, 25, 0, 0),
    # schedule_interval=timedelta(days=1),  # every day
    # At 08:00 AM every day
    schedule_interval="0 8 * * *",
) as dag:
    do_stuff1 = PythonOperator(
        task_id="task_1",
        python_callable=task1_run_Model_Trainer.main,  # entrypoint is main()
    )
    do_stuff2 = PythonOperator(
        task_id="task_2",
        python_callable=task2_upload_Model.main,  # assume entrypoint is main()
    )
    do_stuff1 >> do_stuff2
To make two actions sequential, we use ">>", for example do_stuff1 >> do_stuff2
Right after we put train_model.py into the dags folder of Cloud Composer's Google Storage bucket, Cloud Composer picks up the new DAG and runs it as scheduled.

Cloud Composer also supports UI for Airflow:

In the case of very large tables in BigQuery, we can do batch training by feeding the model batch by batch. BigQuery supports batch query jobs. The demo code:
job_config = bigquery.QueryJobConfig(
    priority=bigquery.QueryPriority.BATCH
)
query_job = client.query(sql, job_config=job_config)  # Make an API request.
while query_job.state != 'DONE':
    # Do something (e.g. sleep) between polls.
    query_job = client.get_job(
        query_job.job_id, location=query_job.location
    )  # Poll the job state with another API request.
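The polling loop above can be wrapped in a small reusable helper (a sketch; the interval and timeout values are arbitrary choices, and `get_state` stands in for fetching `query_job.state`):

```python
import time

def wait_for_done(get_state, poll_interval=1.0, timeout=60.0):
    """Poll get_state() until it returns 'DONE' or the timeout expires."""
    deadline = time.monotonic() + timeout
    while True:
        state = get_state()
        if state == 'DONE':
            return state
        if time.monotonic() >= deadline:
            raise TimeoutError(f"job still in state {state!r}")
        time.sleep(poll_interval)
```

With BigQuery this could be called as `wait_for_done(lambda: client.get_job(query_job.job_id, location=query_job.location).state)`.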
There are several Python libraries that support batching.
If the model is built with TensorFlow and needs to query data from BigQuery, we can use tfio.bigquery.BigQueryClient.
Besides general cloud environments, GCP provides AI Platform Training to run TensorFlow, scikit-learn, and XGBoost training applications in the cloud. AI Platform Training provides the dependencies required to train machine learning models using these hosted frameworks in their runtime versions. Additionally, we can use custom containers to run training jobs with other machine learning frameworks.
AI Platform Training strongly supports distributed training with TensorFlow.
Dataproc is a fully managed and highly scalable service for running Apache Hadoop, Apache Spark, Apache Flink, Presto, and 30+ open source tools and frameworks. Use Dataproc for data lake modernization, ETL, and secure data science, at scale, integrated with Google Cloud, at a fraction of the cost.
MLlib contains many algorithms, including classification, regression, clustering, and collaborative filtering.
The code below is pyspark-training-model/run_trainning.py:
import time

from pyspark.context import SparkContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.sql.session import SparkSession

def vector_from_inputs(r):
    """Collect the features of interest (date and store_nbr) into a vector,
    and package the vector in a tuple with the label (`sales`) for that row.
    """
    return (
        float(r["sales"]),
        Vectors.dense(
            time.mktime(r["date"].timetuple()),
            float(r["store_nbr"]),
        ),
    )
sc = SparkContext()
spark = SparkSession(sc)
# Read the data from BigQuery as a Spark Dataframe.
sales_data = (
spark.read.format("bigquery")
.option("project", "scalable-model-piplines")
.option("table", "store_sales.simplified_data_table")
.load()
)
# Create a view so that Spark SQL queries can be run against the data.
sales_data.createOrReplaceTempView("sales_data")
query = """
SELECT date, store_nbr, family, sales
FROM `sales_data`
"""
clean_data = spark.sql(query)
# Create an input DataFrame for Spark ML using the above function.
training_data = clean_data.rdd.map(vector_from_inputs).toDF(
["label", "features"]
)
training_data.cache()
# Construct a new LinearRegression object and fit the training data.
lr = LinearRegression(maxIter=5, regParam=0.2, solver="normal")
model = lr.fit(training_data)
# Print the model summary.
print("Coefficients:" + str(model.coefficients))
print("Intercept:" + str(model.intercept))
print("R^2:" + str(model.summary.r2))
model.summary.residuals.show()
## Dataproc

To run this Python code, we need to submit it as a job to the Dataproc service.
Copy the code file (run_trainning.py) to a bucket and get its full file path (gs://scalable-model-piplines-dataproc-model-trainer/run_trainning.py).
In the main page of Dataproc UI, choose SUBMIT JOB
Select PySpark as the Job type and insert the link of the Python file.
To use BigQuery service, we need to add spark-bigquery-connector: Insert gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar in the Jar files field.
If nothing goes wrong, we will get a success status.

It is possible to use Cloud Composer to schedule jobs in Dataproc by running Dataproc Serverless workloads. A guideline was provided here.
It is also possible to dynamically create a Dataproc cluster from Cloud Composer, run a scheduled job on it, and automatically decommission the cluster once the job finishes.
Dataflow is a tool for building data pipelines that can run locally or scale up to large clusters in a managed environment. We briefly discussed it in the previous chapter.
Now we will use Apache Beam to build a prediction service in the Google Cloud Dataflow environment.

### DoFn

Below is a template for defining a DoFn class that predicts each row (element) of the data (loaded, for example, from BigQuery). Because it processes the data element by element, the same code can scale up to a massive data-processing pipeline.
import apache_beam as beam

class ApplyDoFn(beam.DoFn):
    def __init__(self):
        self._model = None

    def process(self, element):
        # Lazily load the model once per worker.
        if self._model is None:
            self._model = LOAD_MODEL
        new_x = TRANSFORM(element)
        prediction = self._model.predict(new_x)[0]
        return [{'guid': element['guid'], 'prediction': prediction}]

predictions = data | 'Apply Model' >> beam.ParDo(ApplyDoFn())
Now we can save the prediction results anywhere: another BigQuery table, or a NoSQL database such as Google Cloud Datastore.
### Cloud Datastore

class PublishDoFn(beam.DoFn):
    def __init__(self):
        from google.cloud import datastore
        self._ds = datastore

    def process(self, element):
        client = self._ds.Client()
        # An entity key needs a kind; 'Prediction' is an illustrative choice.
        entity = self._ds.Entity(client.key('Prediction'))
        entity['prediction'] = element['prediction']
        entity['time'] = element['time']
        client.put(entity)

predictions | 'Create entities' >> beam.ParDo(PublishDoFn())
To create a workflow with Beam, we use the pipe syntax in Python to chain different steps together. The result is a DAG of operations to perform that can be distributed across machines in a cluster.
# Define the pipeline steps.
p = beam.Pipeline(options=pipeline_options)

data = p | 'Read Data' >> beam.io.Read(DATA_SOURCE)

scored = data | 'Apply Model for Each Element/User' >> beam.ParDo(ApplyDoFn())

scored | 'Save to BigQuery' >> beam.io.WriteToBigQuery(
    'prediction_Table', 'dataset_ID', schema=schema,
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

scored | 'Save to Other DB like Datastore' >> beam.ParDo(PublishDoFn())

# Run the pipeline.
result = p.run()
result.wait_until_finish()