This repository has been archived on 2025-03-19. You can view files and clone it, but cannot push or open issues or pull requests.
BigDataProject/main.py
2025-03-16 22:28:18 +11:00

38 lines
1.2 KiB
Python

import argparse

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
def transform_data(data_source: str, output_uri: str) -> None:
    """Compute summary averages over the Big Basket product data and save them.

    Reads the CSV at ``data_source`` (the first row is used as the header),
    registers it as the temporary view ``bigbasket_products`` and computes
    four aggregates over it in a single query:

    * ``avg_discount``      -- avg(market_price) - avg(sale_price)
    * ``avg_rating``        -- avg(rating)
    * ``avg_market_price``  -- avg(market_price)
    * ``avg_sale_price``    -- avg(sale_price)

    The one-row result is printed with ``show()`` and written to
    ``output_uri`` as Parquet, replacing whatever was there before.

    Args:
        data_source: Location of the input CSV file. The queries reference
            the columns ``market_price``, ``sale_price`` and ``rating``.
        output_uri: Destination of the Parquet output.
    """
    # All aggregates are produced by ONE query and written ONCE. Running one
    # query per aggregate and writing each result to the same ``output_uri``
    # in "overwrite" mode would leave only the last result on disk.
    #
    # Every column gets an explicit alias: auto-generated names such as
    # "(avg(market_price) - avg(sale_price))" contain spaces and parentheses,
    # which older Spark releases refuse to write as Parquet column names.
    summary_query = """
        select avg(market_price) - avg(sale_price) as avg_discount,
               avg(rating) as avg_rating,
               avg(market_price) as avg_market_price,
               avg(sale_price) as avg_sale_price
        from bigbasket_products
    """

    # The session is used as a context manager so it is stopped on exit,
    # including when reading, querying or writing raises.
    with SparkSession.builder.appName("Big Basket Analysis").getOrCreate() as spark:
        products = spark.read.option("header", "true").csv(data_source)
        products.createOrReplaceTempView("bigbasket_products")

        summary = spark.sql(summary_query)
        summary.show()
        summary.write.mode("overwrite").parquet(output_uri)
if __name__ == "__main__":
    # Command-line entry point: parse the input/output locations and run the
    # job. Both options are mandatory; without ``required=True`` a missing
    # option would reach Spark as ``None`` and fail late with an unclear
    # error instead of a usage message.
    parser = argparse.ArgumentParser(
        description="Compute summary averages over the Big Basket product data."
    )
    parser.add_argument(
        "--data_source",
        required=True,
        help="Location of the input CSV file (with a header row).",
    )
    parser.add_argument(
        "--output_uri",
        required=True,
        help="Location where the Parquet output is written (overwritten if present).",
    )
    args = parser.parse_args()
    transform_data(args.data_source, args.output_uri)