Solution: Data Transformation
Let's look at the solution to the data transformation challenge.
We'll cover the following...
Task
Compute summary statistics on the `review_text` and `vote` columns.
Solution
def impute_NAN_values(df, columnName, value):
    """
    Fill the missing entries of a single column with a fixed value.

    :param df: DataFrame whose column should be imputed
    :param columnName: Name of the column to fill
    :param value: Replacement used wherever that column is missing
    :return: The DataFrame produced by ``fillna``
    """
    # A {column: value} mapping restricts the fill to the requested column.
    replacements = {columnName: value}
    return df.fillna(replacements)
def show_vote_stat(df: SparkDf) -> None:
    """
    Pretty-print summary statistics of the mean vote per ``asin``.

    Votes are averaged within each ``asin`` group; the count, minimum,
    25% and 75% percentiles and maximum of those averages are then printed.

    :param df: A DataFrame having the asin and vote columns
    :return: No return value
    """
    # Step 1: one row per asin holding its average vote.
    mean_vote_per_asin = df.groupby("asin").agg(
        fn.mean(col("vote")).alias("mean_vote")
    )

    # Step 2: describe the distribution of those averages.
    stats_df = mean_vote_per_asin.select("mean_vote").summary(
        "count", "min", "25%", "75%", "max"
    )

    # Step 3: bring the (small) summary to the driver as plain dicts and print.
    stats_rows = [row.asDict(recursive=True) for row in stats_df.collect()]
    pprint(stats_rows)
def show_review_text_stat(df: SparkDf) -> None:
    """
    Print summary statistics for the review text length.

    Rows with a positive ``review_text_len`` are summarised (count, min,
    25%, 75%, max), then the number of rows whose length is one or less
    is reported separately.

    :param df: DataFrame with a review_text_len column
    :return: Nothing
    """
    text_len = col("review_text_len")

    # Only non-empty reviews take part in the summary.
    non_empty_lengths = df.filter(text_len > 0).select("review_text_len")
    stats_df = non_empty_lengths.summary("count", "min", "25%", "75%", "max")
    stats_rows = [row.asDict(recursive=True) for row in stats_df.collect()]

    print("Review Length Stat")
    pprint(stats_rows)

    # Counted on the unfiltered frame, so empty reviews are included here.
    short_review_count = df.filter(text_len <= 1).count()
    print(f"Reviews with length one or less: {short_review_count}")
Solution of challenge data transformation
Explanation
- Line 3: We use the `.fillna` method to impute the NaN values in the given column with the supplied value.
Ask