from pyspark.sql import SparkSession  # was missing: SparkSession is used below (original raised NameError)
from pyspark.sql.functions import col

import argparse


def transform_data(data_source: str, output_uri: str) -> None:
    """Run a set of aggregate queries over the BigBasket products CSV.

    Reads ``data_source`` as a header-ed CSV, registers it as the temp view
    ``bigbasket_products``, runs four aggregate queries (average discount,
    average rating, average market price, average sale price), shows each
    result, and writes each one as Parquet under ``output_uri``.

    Args:
        data_source: Path/URI of the input CSV file (first row is the header).
        output_uri: Base path/URI for the Parquet output. Each query's result
            is written to ``{output_uri}/query_{idx}`` so earlier results are
            not clobbered (the original wrote all four to the same path with
            mode "overwrite", keeping only the last).
    """
    with SparkSession.builder.appName("Big Basket Analysis").getOrCreate() as spark:
        df = spark.read.option("header", "true").csv(data_source)
        df.createOrReplaceTempView("bigbasket_products")

        # NOTE: CSV columns are read as strings; avg() relies on Spark's
        # implicit numeric cast — rows with non-numeric values become NULL
        # and are ignored by avg().
        queries = ["""
            select avg(market_price) - avg(sale_price) from bigbasket_products
        """, """
            select avg(rating) from bigbasket_products
        """, """
            select avg(market_price) from bigbasket_products
        """, """
            select avg(sale_price) from bigbasket_products
        """]

        for idx, query in enumerate(queries):
            transformed_df = spark.sql(query)
            transformed_df.show()
            # One subdirectory per query so all four results persist.
            transformed_df.write.mode("overwrite").parquet(f"{output_uri}/query_{idx}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_source')
    parser.add_argument('--output_uri')
    args = parser.parse_args()
    transform_data(args.data_source, args.output_uri)