final code

This commit is contained in:
Mihit 2025-03-16 22:28:18 +11:00
parent c67fca4f25
commit eef27bd373
2 changed files with 38361 additions and 11 deletions

38341
BigBasket_Products.csv Normal file

File diff suppressed because it is too large Load Diff

31
main.py
View File

@ -1,4 +1,3 @@
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import argparse
@ -8,17 +7,27 @@ def transform_data(data_source: str, output_uri: str) -> None:
df.createOrReplaceTempView("bigbasket_products")
SQL_QUERY = """
select product, count(*)
SQL_QUERY = ["""
select avg(market_price) - avg(sale_price)
from bigbasket_products
group by product
"""
transformed_df = spark.sql(SQL_QUERY)
print(f"Number of rows: {transformed_df.count()}")
transform_df.write.mode("overwrite").parquet(output_uri)
""",
"""
select avg(rating)
from bigbasket_products
""",
"""
select avg(market_price)
from bigbasket_products
""",
"""
select avg(sale_price)
from bigbasket_products
"""
]
for i in SQL_QUERY:
transformed_df = spark.sql(i)
transformed_df.show()
transformed_df.write.mode("overwrite").parquet(output_uri)
if __name__ == "__main__":
parser = argparse.ArgumentParser()