# data_stream.py
import logging
import json

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
import pyspark.sql.functions as psf

# Schema for the incoming service call records
"""
From the StructField documentation:
StructField(name, dataType, nullable)
:param name: the name of this field.
:param dataType: the data type of this field.
:param nullable: indicates whether values of this field can be null.
"""
schema = StructType(
    [
        StructField("crime_id", StringType(), True),
        StructField("original_crime_type_name", StringType(), True),
        StructField("report_date", StringType(), True),
        StructField("call_date", StringType(), True),
        StructField("offense_date", StringType(), True),
        StructField("call_time", StringType(), True),
        StructField("call_date_time", StringType(), True),
        StructField("disposition", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("agency_id", StringType(), True),
        StructField("address_type", StringType(), True),
        StructField("common_location", StringType(), True)
    ]
)
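
# A minimal sketch (not called anywhere) for sanity-checking the schema against a
# hand-written sample record. The helper name _check_schema and the sample field
# values are illustrative only, not taken from the real dataset.
def _check_schema(spark):
    sample = [
        '{"crime_id": "1", "original_crime_type_name": "Traffic Stop",'
        ' "disposition": "ABA", "call_date_time": "2018-12-31T23:57:00"}'
    ]
    # createDataFrame with a plain StringType produces a single column named "value"
    df = spark.createDataFrame(sample, StringType())
    df.select(psf.from_json(psf.col("value"), schema).alias("parsed")).show(truncate=False)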

def run_spark_job(spark):
    # Spark configuration: subscribe to the "calls" topic on the local Kafka
    # broker, start from the earliest offset and cap each trigger at 200 offsets
    logger.debug("Creating Spark Configuration..")
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "calls") \
        .option("startingOffsets", "earliest") \
        .option("maxRatePerPartition", 100) \
        .option("maxOffsetsPerTrigger", 200) \
        .load()

    # Show the schema of the raw Kafka records for a sanity check
    df.printSchema()
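
    # For reference, the Kafka source always exposes the fixed columns key,
    # value, topic, partition, offset, timestamp and timestampType; the JSON
    # payload this job cares about is carried in `value`.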

    # Take only the Kafka value column and cast it to a string
    logger.debug("Extracting column of interest..")
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    # Parse the JSON payload using the schema defined above
    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE")) \
        .select("SERVICE.*")

    # Select the call timestamp, original_crime_type_name and disposition
    logger.debug("Selecting original_crime_type_name and disposition..")
    distinct_table = service_table.select(
        psf.to_timestamp(psf.col("call_date_time")).alias("call_date_time"),
        psf.col("original_crime_type_name"),
        psf.col("disposition")
    )
    distinct_table.printSchema()

    # Count occurrences of each original crime type over 10-minute windows,
    # tolerating records that arrive up to 60 minutes late
    logger.debug("Counting the number of crimes..")
    agg_df = distinct_table \
        .withWatermark("call_date_time", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_date_time, "10 minutes"),
            psf.col("original_crime_type_name")
        ) \
        .count()
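
    # At this point agg_df has three columns: window (a struct with start and
    # end timestamps), original_crime_type_name and count, one row per crime
    # type per 10-minute window.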

    # TODO Q1. Submit a screen shot of a batch ingestion of the aggregation

    # Write the aggregated counts to the console in complete output mode
    logger.debug("Writing output stream..")
    query = agg_df \
        .writeStream \
        .format("console") \
        .outputMode("complete") \
        .start()

    # Load the static radio code lookup table (if radio_code.json is a single
    # JSON array rather than JSON Lines, spark.read may additionally need
    # .option("multiline", "true"))
    radio_code_json_filepath = "radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    # Clean up the data so that the column names match for the join: rename
    # disposition_code to disposition. Calling .collect() here would turn the
    # DataFrame into a plain Python list and break the join below.
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    # Join the streaming calls with the static radio code table on disposition.
    # agg_df no longer carries a disposition column after the grouping above,
    # so the join is done against distinct_table, which still has it.
    join_df = distinct_table.join(radio_code_df, "disposition", "left_outer")

    join_query = join_df \
        .writeStream \
        .format("console") \
        .outputMode("append") \
        .start()

    # TODO attach a ProgressReporter (one illustrative polling approach,
    # log_progress, is sketched after this function)

    # Block until either streaming query terminates. Calling awaitTermination()
    # on the first query before setting up the join would block here forever,
    # and the join query would never start.
    spark.streams.awaitAnyTermination()
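
# A minimal sketch for the ProgressReporter TODO above, assuming that polling
# StreamingQuery.lastProgress is acceptable (the Python API of Spark 2.x exposes
# no StreamingQueryListener). The name log_progress and its interval_seconds
# parameter are illustrative only. Because awaitAnyTermination() blocks the main
# thread, this helper would have to be driven from a separate thread, e.g.
# threading.Thread(target=log_progress, args=(query,), daemon=True).start()
def log_progress(query, interval_seconds=10):
    """Periodically log the latest progress report of a running streaming query."""
    import time

    while query.isActive:
        progress = query.lastProgress
        if progress is not None:
            logger.info("Streaming progress: %s", json.dumps(progress))
        time.sleep(interval_seconds)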

if __name__ == "__main__":
    # Configure logging so the logger calls above produce output; the script's
    # own logger runs at DEBUG while everything else stays at INFO
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    # Create the Spark session in local mode
    spark = SparkSession \
        .builder \
        .config("spark.ui.port", 3000) \
        .master("local[*]") \
        .appName("KafkaSparkStructuredStreaming") \
        .getOrCreate()
    logger.info("Spark started")

    run_spark_job(spark)

    spark.stop()
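
# Example invocation (a sketch; the spark-sql-kafka package coordinate must
# match the local Spark and Scala versions, e.g. Scala 2.11 with Spark 2.3.x):
#
#   spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.4 \
#       data_stream.py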