commit c67fca4f253af03f510ed071930965bc33db0182 Author: Mihit Date: Sun Mar 16 21:32:53 2025 +1100 init diff --git a/main.py b/main.py new file mode 100644 index 0000000..e28517f --- /dev/null +++ b/main.py @@ -0,0 +1,29 @@ +from pyspark.sql import SparkSession +from pyspark.sql.functions import col +import argparse + +def transform_data(data_source: str, output_uri: str) -> None: + with SparkSession.builder.appName("Big Basket Analysis").getOrCreate() as spark: + df = spark.read.option("header", "true").csv(data_source) + + df.createOrReplaceTempView("bigbasket_products") + + SQL_QUERY = """ + select product, count(*) + from bigbasket_products + group by product + """ + + transformed_df = spark.sql(SQL_QUERY) + + print(f"Number of rows: {transformed_df.count()}") + + transform_df.write.mode("overwrite").parquet(output_uri) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--data_source') + parser.add_argument('--output_uri') + args = parser.parse_args() + + transform_data(args.data_source, args.output_uri) \ No newline at end of file