from google.colab import drive
drive.mount('/content/drive/', force_remount=True)
path='/content/drive/Shareddrives/Introduction to Data Science/data/'

Mounted at /content/drive/


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
import datetime
import matplotlib
import json
from scipy.stats import gaussian_kde
from scipy.stats import zscore
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


import json
import numpy as np
import pandas as pd

product = []
with open(path + 'meta_All_Beauty.json') as f:
    for line in f:
        product.append(json.loads(line))


raw_product_table = pd.DataFrame(product)
raw_product_table.head(10)


import json
import numpy as np
import pandas as pd

review = []
with open(path + 'Review_Beauty.json') as f:
    for line in f:
        review.append(json.loads(line))


raw_review_table = pd.DataFrame(review)
raw_review_table.head(10)


header_names = ['ProductID', 'UserID', 'Review', 'Date']
review_simplified = pd.read_csv(path + 'All_Beauty.csv', header=None, names=header_names)
review_simplified.head(50)


review_simplified.shape

(371345, 4)


import json
import numpy as np
import pandas as pd

luxuryBeautyReview = []
with open(path + 'Luxury_Beauty.json') as f:
    for line in f:
        luxuryBeautyReview.append(json.loads(line))


raw_luxuryBeautyReview = pd.DataFrame(luxuryBeautyReview)
raw_luxuryBeautyReview


import json
import numpy as np
import pandas as pd

luxuryBeautyProduct = []
with open(path + 'meta_Luxury_Beauty.json') as f:
    for line in f:
        luxuryBeautyProduct.append(json.loads(line))


raw_luxuryBeautyProduct = pd.DataFrame(luxuryBeautyProduct)
raw_luxuryBeautyProduct


#drop irrelevant observations
product_table = raw_product_table.drop(['category','tech1','description', 'fit','image','tech2','feature','main_cat','similar_item','date'], axis=1)


# some ASIN inside details has space after 
# example  'ASIN: ' vs 'ASIN:'
# but going to drop 'details' because didn't use it
# this is to check whether the asin and the ASIN inside details is correct

count = 0
index_to_drop = [] 

for i in range(len(product_table.index)):
    dct = product_table['details'].iloc[i]
    if 'ASIN: ' in dct: 
        asin = dct['ASIN: ']
    elif 'ASIN:' in dct:
        asin = dct['ASIN:']
    if asin != product_table['asin'].iloc[i]:
        index_to_drop.append(i)
        count += 1
        
print(count)

437


##drop irrelevant observations
product_table = raw_product_table.drop(['category','tech1','description','fit','image','tech2','brand','feature','main_cat','similar_item','date','details'],axis = 1)

##drop duplicates
#no duplicate asins and row
product_table.drop_duplicates(['asin'],inplace = True)
product_table.reset_index(drop=True, inplace=True)

##fix structural errors

#for column 'rank' 
#remove uneccessary symbol like comma ',' and words
#convert the string into integer 
product_table['rank'].replace(',','', regex=True, inplace=True)
product_table['rank'] = product_table['rank'].str.extract('(\d+)')

#for column 'price'
#if 'price' don't start with $ or has '-' means null value, fillna with 'unknown'
#remove uneccessary symbol like comma ',' and '$'
product_table.loc[product_table["price"].str[0] != "$", "price"] = "unknown"
product_table.loc[product_table["price"].str.contains('-'), "price"] = "unknown"
product_table['price'].replace("\$","", regex=True, inplace=True)
product_table['price'].replace(',','', regex=True, inplace=True)

##deal with missing data
#missing data in 'rank' and 'price'
#product_table.replace('\[]','', regex=True, inplace=True)
product_table['rank'].fillna(0,inplace = True)
product_table['price'].replace("unknown",0, regex=True, inplace=True)
product_table.fillna("unknown",inplace = True)

##changing data type
product_table['rank']= product_table['rank'].astype(int)
product_table['price'] = product_table['price'].apply(pd.to_numeric)

##Identify and handle outliers
z_scores = stats.zscore(pd.DataFrame(product_table['rank']))
abs_z_scores = np.abs(z_scores)
outlier = (abs_z_scores > 3).all(axis=1)
product_table.drop(product_table[outlier].index, inplace = True)

product_table.shape

(32487, 6)


unique = product_table["asin"].value_counts()
unique

B01968IMO2    1
B000GGKPXK    1
B00BXVT9G6    1
B00BLUYOM8    1
B00SVDX5W8    1
             ..
B01E6L9C2U    1
B00RW5XT9A    1
B00UZWNZDW    1
B00GMPUFPW    1
B00EVPFNKW    1
Name: asin, Length: 32487, dtype: int64


##drop irrelevant observations
review_table = raw_review_table.drop(['reviewerName','reviewText','summary','style','image'],axis = 1)

##drop duplicates
review_table.drop_duplicates(inplace = True)
review_table.reset_index(drop=True, inplace=True)

##fix structural errors
#for column 'vote'
#remove uneccessary symbol ','
review_table['vote'].replace(',','', regex=True, inplace=True)
test_review_table = review_table.copy(deep=True)

##deal with missing data
review_table['vote'].fillna(0,inplace = True)
review_table.fillna('unknown',inplace = True)

##changing data type
review_table['vote'] = review_table['vote'].astype(int)

#separate reviewTime into three different columns
review_table['reviewTime'].replace(',','', regex=True, inplace=True)
date = review_table['reviewTime'].str.split(' ', n=2,expand = True)
date.columns =['month','day','year']

#drop reviewTime column
review_table.drop(['reviewTime'],axis = 1,inplace = True)

#concat both of the tables
review_table = pd.concat([review_table,date], axis=1)

#rearrange sequence of the columns
cols = review_table.columns.tolist()
cols = cols[-1:] + cols[-3:-1] + cols[:-3]
review_table = review_table[cols]

review_table


##drop irrelevant observations
luxuryBeautyReview_table = raw_luxuryBeautyReview.drop(['reviewerName','reviewText','summary','style','image'],axis = 1)

##drop duplicates
luxuryBeautyReview_table.drop_duplicates(inplace = True)
luxuryBeautyReview_table.reset_index(drop=True, inplace=True)

##fix structural errors
#for column 'vote'
#remove uneccessary symbol ','
luxuryBeautyReview_table['vote'].replace(',','', regex=True, inplace=True)
#test_review_table = review_table.copy(deep=True)

##deal with missing data
luxuryBeautyReview_table['vote'].fillna(0,inplace = True)
luxuryBeautyReview_table.fillna('unknown',inplace = True)

##changing data type
luxuryBeautyReview_table['vote'] = luxuryBeautyReview_table['vote'].astype(int)

#separate reviewTime into three different columns
luxuryBeautyReview_table['reviewTime'].replace(',','', regex=True, inplace=True)
date_luxuryBeautyReview = luxuryBeautyReview_table['reviewTime'].str.split(' ', n=2,expand = True)
date_luxuryBeautyReview .columns =['month','day','year']

#drop reviewTime column
luxuryBeautyReview_table.drop(['reviewTime'],axis = 1,inplace = True)

#concat both of the tables
luxuryBeautyReview_table = pd.concat([luxuryBeautyReview_table,date_luxuryBeautyReview ], axis=1)

#rearrange sequence of the columns
cols_LB = luxuryBeautyReview_table.columns.tolist()
cols_LB = cols_LB[-1:] + cols_LB[-3:-1] + cols_LB[:-3]
luxuryBeautyReview_table = luxuryBeautyReview_table[cols_LB]

luxuryBeautyReview_table


##drop irrelevant observations
luxuryBeautyProduct_table = raw_luxuryBeautyProduct.drop(['category','tech1','description','fit','image','tech2','brand','feature','main_cat','similar_item','date','details'],axis = 1)

##drop duplicates
#no duplicate asins and row
luxuryBeautyProduct_table.drop_duplicates(['asin'],inplace = True)
luxuryBeautyProduct_table.reset_index(drop=True, inplace=True)

##fix structural errors

#for column 'rank' 
#remove uneccessary symbol like comma ',' and words
#convert the string into integer 
luxuryBeautyProduct_table['rank'].replace(',','', regex=True, inplace=True)
luxuryBeautyProduct_table['rank'] = luxuryBeautyProduct_table['rank'].str.extract('(\d+)')

#for column 'price'
#if 'price' don't start with $ or has '-' means null value, fillna with 'unknown'
#remove uneccessary symbol like comma ',' and '$'
luxuryBeautyProduct_table.loc[luxuryBeautyProduct_table["price"].str[0] != "$", "price"] = "unknown"
luxuryBeautyProduct_table.loc[luxuryBeautyProduct_table["price"].str.contains('-'), "price"] = "unknown"
luxuryBeautyProduct_table['price'].replace("\$","", regex=True, inplace=True)
luxuryBeautyProduct_table['price'].replace(',','', regex=True, inplace=True)

##deal with missing data
#missing data in 'rank' and 'price'
#product_table.replace('\[]','', regex=True, inplace=True)
luxuryBeautyProduct_table['rank'].fillna(0,inplace = True)
luxuryBeautyProduct_table['price'].replace("unknown",0, regex=True, inplace=True)
luxuryBeautyProduct_table.fillna("unknown",inplace = True)

##changing data type
luxuryBeautyProduct_table['rank']= luxuryBeautyProduct_table['rank'].astype(int)
luxuryBeautyProduct_table['price'] = luxuryBeautyProduct_table['price'].apply(pd.to_numeric)

##Identify and handle outliers
z_scores_LB = stats.zscore(pd.DataFrame(luxuryBeautyProduct_table['rank']))
abs_z_scores_LB = np.abs(z_scores_LB)
outlier_LB = (abs_z_scores_LB > 3).all(axis=1)
luxuryBeautyProduct_table.drop(luxuryBeautyProduct_table[outlier_LB].index, inplace = True)

luxuryBeautyProduct_table


#min-max normalization
product_table['rank'] = (product_table['rank'] - product_table['rank'].min())/(product_table['rank'].max()-product_table['rank'].min())
product_table


# Outer merge of review_table & product_table
dfMergedOuter = (pd.merge(product_table,review_table, on='asin', how='outer', indicator=True)).set_index('asin')

# Outer left (product_table), to find out the product that doesn't have any review
dfMergedOuterProduct=dfMergedOuter[dfMergedOuter['_merge']=='left_only']
dfMergedOuterProduct


# Outer merge of review_table & product_table
dfMergedOuter2 = pd.merge(review_table, product_table, on='asin', how='outer', indicator=True)

# Outer left (review_table), to find out the product review that doesn't exist in product_table
dfMergedOuterReview=dfMergedOuter2[dfMergedOuter2['_merge']=='left_only']
dfMergedOuterReview


# to find the intersection betweeen both tables
dfMergedInner = (pd.merge(product_table,review_table, on='asin', how='inner', indicator=True)).set_index('asin')
dfMergedInner.reset_index(inplace = True)
dfMergedInner


# Drop both products since they do not have any reviews
product_table.drop(product_table[product_table['asin']=='B000NICBSI'].index, inplace = True) 
product_table.drop(product_table[product_table['asin']=='B01HHYC4UA'].index, inplace = True)


# Update our review table using the inner merge
final_review_table = dfMergedInner.drop(['title','also_buy','rank','also_view','price','reviewerID', '_merge'],axis = 1)
final_review_table


dfMergedInner


# calculates average rating of each product
# -----------------------------------------
# 1 parameter:
# review_arg['asin', 'overall']
# -----------------------------------------
# return:
# avg_review['asin', 'average-rating']
# -----------------------------------------

def avgReview(review_arg):
  avg_review = review_arg.groupby(['asin']).mean()
  avg_review.rename(columns={'overall':'average-rating'},inplace=True)
  avg_review.reset_index(inplace = True)
  return avg_review


# calculates number of reviews of each product
# --------------------------------------------
# 1 parameter:
# review['asin', 'overall']
# --------------------------------------------
# return:
# count_review ['asin', 'number-of-review']
# --------------------------------------------

def countReview(review_arg):
  count_review = review_arg.groupby(['asin']).count()
  count_review.rename(columns={'overall':'number-of-review'},inplace=True)
  count_review.reset_index(inplace = True)
  return count_review


# merge 3 dataframe together
# --------------------------------------------
# 3 paramters:
# avg_review_arg ['asin', 'average-rating']
# count_review_arg ['asin', 'number-of-review']
# product_rank_arg ['asin','rank']
# --------------------------------------------
# result:
# merged_review_table ['asin', 'average-rating', 'number-of-review', 'rank']
# --------------------------------------------

def merge_avgReview_numReview_product(avg_review_arg, count_review_arg, product_rank_arg):
# merge average rating score
  merged_review_table = (pd.merge(product_rank_arg, avg_review_arg, on='asin', how='inner', indicator=True)).set_index('asin')
  merged_review_table.reset_index(inplace=True)
  merged_review_table = merged_review_table.drop(['_merge'],axis = 1)
  merged_review_table.rename(columns={'overall':'average-rating'},inplace=True)

  # merge number of reviews
  merged_review_table = (pd.merge(merged_review_table, count_review_arg, on='asin', how='inner', indicator=True)).set_index('asin')
  merged_review_table.reset_index(inplace = True)
  merged_review_table = merged_review_table.drop(['_merge'],axis = 1)
  merged_review_table.rename(columns={'overall':'number-of-review'},inplace=True)

  # filter out product with 0 as sales rank (32083 products left)
  merged_review_table = merged_review_table[merged_review_table['rank'] > 0]
  merged_review_table.sort_values(by=['rank']) 

  return merged_review_table


# calculate avg_review and count_review, and then merge with product_rank
# --------------------------------------------
# 2 parameters:
# review_arg ['asin', 'overall']
# product_rank_arg ['asin','rank']
# --------------------------------------------
# return:
# merged_review_table ['asin', 'rank', 'average-rating, 'number-of-review']
# --------------------------------------------

def process_merged_review(review_arg, product_rank_arg):
    avg_review = avgReview(review_arg)
    count_review = countReview(review_arg)
    merged_review_table = merge_avgReview_numReview_product(avg_review, count_review, product_rank_arg)
    return merged_review_table


# calculates number of verified reviews of each product
# --------------------------------------------
# 2 parameters:
# verified_review ['asin', 'overall', 'verified']
# merged_review_table
# --------------------------------------------
# return:
# mv_review_table ['asin', 'rank', 'average-rating, 'number-of-review',
# 'average-verified-rating', 'verified']
# --------------------------------------------

def process_verified_review(verified_review, merged_review_table_arg):
  # gets verified reviews' average score (left 30430 rows)
  avg_verified_review = verified_review[verified_review['verified'] == True]
  avg_verified_review = avg_verified_review.groupby(['asin']).mean()
  avg_verified_review.rename(columns={'overall':'average-verified-rating'},inplace=True)
  avg_verified_review.drop(['verified'],axis=1,inplace=True)
  avg_verified_review.reset_index(inplace = True)
  # merge merged_review_table && avg_verified_review
  copy_merged_review = merged_review_table_arg.copy()
  #mv_review_table = (pd.merge(merged_review_table_arg.copy(), avg_verified_review, on='asin', how='left', indicator=True).fillna(0)).set_index('asin')
  copy_merged_review['average-verified-rating'] = copy_merged_review.asin.map(avg_verified_review\
                                                                             .set_index('asin')['average-verified-rating'])\
                                                                             .fillna(0).astype(float)
  # return
  return copy_merged_review


review = final_review_table[['asin', 'overall']]
product_rank = product_table[['asin','rank']]
merged_review_table = process_merged_review(review, product_rank)
merged_review_table


vr = final_review_table[['asin', 'overall', 'verified']]
mv_review_table = process_verified_review(vr, merged_review_table)
mv_review_table


merged_review_table.corr()


def filterByYear(x, data):
    result = data['year'] == x
    result = data[result]
    return result


# max date review groupby 'asin'
product_max_date = final_review_table.groupby(['asin'], sort=True)['year'].max()
product_max_date = pd.DataFrame(product_max_date)
product_max_date.reset_index(level=0, inplace=True)

# min date review groupby 'asin'
product_min_date = final_review_table.groupby(['asin'], sort=True)['year'].min()
product_min_date = pd.DataFrame(product_min_date)
product_min_date.reset_index(level=0, inplace=True)


# merge max and min together
# rename column and reset index
product_min_max_date = (pd.merge(product_max_date, product_min_date, on='asin', how='inner', indicator=True)).set_index('asin')
product_min_max_date.rename(columns = {'year_x':'max','year_y':'min'}, inplace = True)
product_min_max_date.drop(['_merge'], axis=1, inplace =True)
product_min_max_date.reset_index(level=0, inplace=True)

# calculate diff between min and max
product_min_max_date = product_min_max_date.astype({'max': 'int', 'min': 'int'})
product_min_max_date['diff'] = (product_min_max_date['max'] - product_min_max_date['min'] + 1)

# final table
product_min_max_date


def calculateDensity(x_axis, y_axis):
    x= x_axis
    y= y_axis

    # Calculate the point density
    xy = np.vstack([x,y])
    z = gaussian_kde(xy)(xy)
    
    # Sort the points by density, so that the densest points are plotted last
    #idx = z.argsort()
    #x, y, z = x[idx], y[idx], z[idx]
    
    #return density
    return z


#merging
#to find the intersection betweeen both tables
luxuryBeautyMergedInner = (pd.merge(luxuryBeautyProduct_table,luxuryBeautyReview_table, on='asin', how='inner', indicator=True)).set_index('asin')
luxuryBeautyMergedInner.reset_index(inplace = True)
luxuryBeautyMergedInner


final_luxuryBeautyReview_table = luxuryBeautyMergedInner.drop(['title','also_buy','rank','also_view','price','reviewerID', '_merge'],axis = 1)
final_luxuryBeautyReview_table


review_luxuryBeauty = final_luxuryBeautyReview_table[['asin', 'overall']]
luxuryBeautyProduct_rank = luxuryBeautyProduct_table[['asin','rank']]
merged_luxuryBeautyReview_table = process_merged_review(review_luxuryBeauty, luxuryBeautyProduct_rank)
merged_luxuryBeautyReview_table


luxuryBeautyVR = final_luxuryBeautyReview_table[['asin', 'overall', 'verified']]
mv_luxuryBeautyReview_table = process_verified_review(luxuryBeautyVR, merged_luxuryBeautyReview_table)
mv_luxuryBeautyReview_table


# suitable for passing merged_review_table or mv_review_table
def CalculateluxuryBeautyTimeWeightedAvg(rev_table):

    frt_copy = final_luxuryBeautyReview_table.copy()
    mmScaler = MinMaxScaler()
    frt_copy[['unixReviewTime']] = mmScaler.fit_transform(frt_copy[['unixReviewTime']])
    frt_copy['time-weighted-overall'] = frt_copy['overall'] * frt_copy['unixReviewTime'] 
    
    g = frt_copy[['asin', 'time-weighted-overall']]
    g = g.groupby(['asin'])['time-weighted-overall'].mean()
    time_score = pd.DataFrame(g)#, columns=['time-weighted-avg'])
    time_score.reset_index(inplace=True)
    
    rev_table = (pd.merge(rev_table, time_score, on='asin', how='inner', indicator=True)).set_index('asin')
    rev_table.reset_index(inplace = True)
    rev_table = rev_table.drop(['_merge'],axis = 1)

    return rev_table

luxuryBeautyReview_table_withTime = CalculateluxuryBeautyTimeWeightedAvg(mv_luxuryBeautyReview_table )
luxuryBeautyReview_table_withTime


bins = np.linspace(min(merged_review_table['rank']), max(merged_review_table['rank']), 5)
bins

array([1.00533055e-05, 2.50007540e-01, 5.00005027e-01, 7.50002513e-01,
       1.00000000e+00])


group_names = ['Very Good', 'Good', 'Moderate', 'Bad']
merged_review_table['rank-binned'] = pd.cut(merged_review_table['rank'], bins, labels=group_names, include_lowest=True)
merged_review_table.head()


plt.figure(figsize = (5,4))
plt.bar(group_names, merged_review_table['rank-binned'].value_counts())
plt.title("Number of Product against Sales Rank")
plt.xlabel('Sales Rank')
plt.ylabel('Number of Product')

Text(0, 0.5, 'Number of Product')


bins_luxuryBeauty = np.linspace(min(luxuryBeautyReview_table_withTime['rank']), max(luxuryBeautyReview_table_withTime['rank']), 5)
rank_names = ['Very Good', 'Good', 'Moderate', 'Bad']
luxuryBeautyReview_table_withTime['rank-binned'] = pd.cut(luxuryBeautyReview_table_withTime['rank'], bins_luxuryBeauty, labels=rank_names, include_lowest=True)
luxuryBeautyReview_table_withTime


plt.figure(figsize = (5,4))
bp = sns.boxplot(x='rank-binned', y='average-rating', data=merged_review_table)


average_v_rank = sns.scatterplot(x='average-rating', 
                                 y='rank',
                                 data=merged_review_table)

average_v_rank.set(xlabel='Average Rating of Beauty Product', 
                   ylabel='Sales Rank', 
                   title="Average Rating of Beauty Products against Sales Rank")

[Text(0, 0.5, 'Sales Rank'),
 Text(0.5, 0, 'Average Rating of Beauty Product'),
 Text(0.5, 1.0, 'Average Rating of Beauty Products against Sales Rank')]


sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='average-rating', y='rank', data = merged_review_table, hue='rank-binned')

<matplotlib.axes._subplots.AxesSubplot at 0x7fc70886fa50>


num_review_am200 = merged_review_table.copy()
num_review_am200 = num_review_am200[num_review_am200['number-of-review']<= 200]


num_review_am200['average-rating-range'] = pd.cut(num_review_am200['average-rating'],
                                                  [0, 3, 4,6], 
                                                  right=False,
                                                  labels=["[1, 3)","[3, 4)","[4, 5]"])
num_review_am200


#Finding correlation for 3 different categories of rating
low_rating = num_review_am200['average-rating-range'] == "[1, 3)"
mod_rating = num_review_am200['average-rating-range'] == "[3, 4)"
high_rating = num_review_am200['average-rating-range'] == "[4, 5]"

low_rating_table = num_review_am200[low_rating]
mod_rating_table = num_review_am200[mod_rating]
high_rating_table = num_review_am200[high_rating]
print(high_rating_table['average-rating'].corr(high_rating_table['rank']),"\n")
print(mod_rating_table['average-rating'].corr(mod_rating_table['rank']),"\n")
print(low_rating_table['average-rating'].corr(low_rating_table['rank']))

0.19258599024477144 

-0.4334039295627057 

-0.3834335522148039


sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='average-rating', y='rank', data=num_review_am200, hue='average-rating-range', palette=["k", "C3","C0"])

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7087d79d0>


sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='number-of-review', y='rank', data=num_review_am200, hue='average-rating-range', palette=["k", "C3","C0"])

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7087d7910>


plt.subplot(3, 1, 1)
sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='number-of-review', y='rank', data=low_rating_table, color="k")

plt.subplot(3, 1, 2)
sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='number-of-review', y='rank', data=mod_rating_table, color="C3")

plt.subplot(3, 1, 3)
sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='number-of-review', y='rank', data=high_rating_table, color="C0")

<matplotlib.axes._subplots.AxesSubplot at 0x7fc708517110>


num_review = merged_review_table[['asin','number-of-review', 'rank']]
num_review = num_review.sort_values(by=['rank'])
num_review


num_review.corr()


# sns.heatmap(num_review.corr(),cmap='jet')


# figure, axes = plt.subplots(1, 2)
# num_review.plot(ax=axes[0])
# sns.scatterplot(x="number-of-review", y="rank", data=num_review)


# plt.figure(figsize = (10,8))
# sns.regplot(x="number-of-review", y="rank", data=num_review)
# plt.ylim(0,)


# filter out num_review that are too high (consider them as outlier)
num_review_filter = num_review[num_review['number-of-review']<200]
num_review_filter.shape

# 32083(total all) 
# 32029(<500) / 54 (>=500)
# 31856(<200) / 227 (>=200)
# 31540(<100) / 543 (>=100)

(31856, 3)


# square brackets '[]' mean the end point is included
# round parentheses '()' mean it's excluded
num_review_filter['num-review-range'] = pd.cut(num_review_filter['number-of-review'],
                                                  [0, 4, 7, 10, 200], 
                                                  right=True,
                                                  labels=['[1,4]', '(4,7]', '(7,10]', '(10,200)'])
num_review_filter

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


num_review_filter['num-review-range'].value_counts().sort_index

<bound method Series.sort_index of [1,4]       23318
(10,200)     4273
(4,7]        2818
(7,10]       1447
Name: num-review-range, dtype: int64>


sns.set(rc={'figure.figsize':(15,10)})
sttrplt=sns.scatterplot(x='number-of-review', y='rank', data = num_review_filter, hue='num-review-range')
plt.annotate(
# Label and coordinate
'Hard Candy Glow All The Way', xy=(110,0.86), xytext=(124, 0.9), color= "red", arrowprops={"facecolor": 'red', "shrink": 0.15}
)
plt.annotate(
# Label and coordinate
'Death and Dentistry', xy=(122,0.82), xytext=(137, 0.86), color= "red", arrowprops={"facecolor": 'red', "shrink": 0.20}
)
plt.annotate(
# Label and coordinate
'Norelco 8894 XL \nSpectra Shaver', xy=(107,0.64), xytext=(122, 0.70), color= "red", arrowprops={"facecolor": 'red', "shrink": 0.20}
)
plt.annotate(
# Label and coordinate
'My New Pink \nButton - Audry', xy=(160,0.62), xytext=(175, 0.68), color= "red", arrowprops={"facecolor": 'red', "shrink": 0.20}
)

sttrplt.set(xlabel='Total Number of Reviews', 
            ylabel='Sales Rank', 
            title="Total Number of Reviews against Sales Rank")

[Text(0, 0.5, 'Sales Rank'),
 Text(0.5, 0, 'Total Number of Reviews'),
 Text(0.5, 1.0, 'Total Number of Reviews against Sales Rank')]


num_review_filter[num_review_filter['number-of-review'] <=50].corr()


rating_grouping = final_review_table[['asin', 'overall']]

# count by number of high rating reviews
high_review_count = rating_grouping.groupby(['asin'])['overall'].apply(lambda x: (x >= 4).sum())
high_review_count = pd.DataFrame(high_review_count)
high_review_count.rename(columns={'overall':'number-of-high-rating-review'},inplace=True)
high_review_count.reset_index(inplace = True)

# count by number of low rating reviews
low_review_count = rating_grouping.groupby(['asin'])['overall'].apply(lambda x: (x < 4).sum())
low_review_count = pd.DataFrame(low_review_count)
low_review_count.rename(columns={'overall':'number-of-low-rating-review'},inplace=True)
low_review_count.reset_index(inplace = True)


# m is a updated version of merged_review_table, with column recording the number of high & low rating reviews of each products
m = (pd.merge(high_review_count, low_review_count, on='asin', how='inner', indicator=True)).set_index('asin')
m.reset_index(inplace=True)
m = m.drop(['_merge'],axis = 1)
m

m = (pd.merge(merged_review_table, m, on='asin', how='inner', indicator=True)).set_index('asin')
m.reset_index(inplace=True)
m = m.drop(['_merge'],axis = 1)
m


filter_num_review_50 = m['number-of-review'] <=50
m = m[filter_num_review_50]
m[['number-of-review','rank']].corr()


nReview50 = sns.scatterplot(x="number-of-review", y="rank", data=m,color="blue")
nReview50.set(title="Number of Reviews(<=50) against Sales Rank")

[Text(0.5, 1.0, 'Number of Reviews(<=50) against Sales Rank')]


# ploting a graph to see the relationship between number of high rating reviews (those that are >= 4), and the sales rank
plt.subplot(2, 1, 1)
# getting those products that have at least one high rating review
m1 = m[m['number-of-high-rating-review'] > 0]
# we have 27858 products here
sns.scatterplot(x="number-of-high-rating-review", y="rank", data=m1)


# ploting a graph to see the relationship between number of low rating reviews (those that are < 4), and the sales rank
plt.subplot(2, 1, 2)
# getting those products that have at least one low rating review
m2 = m[m['number-of-low-rating-review'] > 0]
# we have 15950 products here
sns.scatterplot(x="number-of-low-rating-review", y="rank", data=m2)

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7156e8a10>


# filter final_review_table, groupby asin and year
p_AvgRating_ByYear = final_review_table.groupby(['asin','year'], sort=True)['overall'].agg(['mean', 'count'])
p_AvgRating_ByYear = pd.DataFrame(p_AvgRating_ByYear)
p_AvgRating_ByYear.reset_index(level=['asin','year'], inplace=True)

# filter merged_review_table
p_rank = merged_review_table[['asin', 'rank']]

# merge both average rating and rank
avgRating_byYear = (pd.merge(p_rank, p_AvgRating_ByYear, on='asin', how='inner', indicator=True))
avgRating_byYear.drop(['_merge'], axis=1, inplace =True)
avgRating_byYear = avgRating_byYear.astype({'year': 'int'})
avgRating_byYear.sort_values(by=['year'], inplace=True)
avgRating_byYear = avgRating_byYear[['year','asin','rank','count','mean']]

avgRating_byYear


# sort table according to rank, and reindex the rank
sorted_avgRating_byYear = avgRating_byYear.sort_values(by='rank')
sorted_avgRating_byYear.reset_index(drop=True,inplace=True)
sorted_avgRating_byYear['nrank'] = sorted_avgRating_byYear.index+1
sorted_avgRating_byYear


frame_list =[]
density_list = []
date_list = []
for x in range(2000, 2019):
    tempPd = filterByYear(x, sorted_avgRating_byYear)
    density = calculateDensity(tempPd['count'], tempPd['nrank'])
    tempPd = pd.DataFrame(tempPd)
    frame_list.append(tempPd)
    density_list.append(density)
    date_list.append(str(x))


#########
# If you're using Jupyter notebook
# Use this command to switch on matplotlib's backend, this will enable interactive UI

# %matplotlib notebook 

##########
# After running the animated gif
# use this command to switch off matplotlib's backend, this will disable interactive UI

# %matplotlib inline

#########
# To learn more about matplotlib
# %matplotlib --list


fig, ax = plt.subplots(figsize=(6,6))

ax.scatter([1,2000], [1,2000], c=[1,2000])
ax.set_title("title")
plt.xlabel("Number of Review")
plt.ylabel("Sales Rank")

#not in use (may omit)
def init():
    sc = ax.set_data([], [])
    return sc,

def animate(i):
    ax.scatter(frame_list[i]['count'], frame_list[i]['rank'], c=density_list[i], edgecolor='none')
    ax.set_title(date_list[i])
    
ani = matplotlib.animation.FuncAnimation(fig, animate, frames=19, interval=1000)#, interval=100)

plt.show()


vg_filter = merged_review_table['rank'] <= 870386.75
vg_table = merged_review_table[vg_filter]
vg_table.sort_values(by=['rank'])


bins_2 = np.linspace(min(vg_table['rank']), max(vg_table['rank']), 5)
bins_2

array([1.00533055e-05, 2.50007540e-01, 5.00005027e-01, 7.50002513e-01,
       1.00000000e+00])


group_names_2 = ['1', '2', '3', '4']
vg_table['upper-rank-binned'] = pd.cut(vg_table['rank'], bins_2, labels=group_names_2, include_lowest=True)
vg_table.head()


plt.figure(figsize = (5,4))
plt.bar(group_names_2, vg_table['upper-rank-binned'].value_counts())

<BarContainer object of 4 artists>


plt.figure(figsize = (5,4))
bp = sns.boxplot(x='upper-rank-binned', y='average-rating', data=vg_table)


vg_table.sort_values(by='average-rating')


vg_table_lt200 = vg_table[vg_table['number-of-review']<200].copy() # cleared 214 products
vg_table_top = vg_table_lt200[vg_table['upper-rank-binned'] == '1'].copy()

vg_table_top['average-rating-range'] = pd.cut(vg_table_top['average-rating'],
                                                  [0, 4, 6], 
                                                  right=False,
                                                  labels=["[1, 4)", "[4, 5]"])
vg_table_top.head()

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.


sns.set(rc={'figure.figsize':(10,8)})
sns.scatterplot(x='average-rating', y='rank', data=vg_table_top, hue='average-rating-range', palette=["k", "C3"])

<matplotlib.axes._subplots.AxesSubplot at 0x7fc707f72290>


sns.set(rc={'figure.figsize':(10,8)})
sns.scatterplot(x='number-of-review', y='rank', data=vg_table_top, hue='average-rating-range', palette=["k", "C3"])

<matplotlib.axes._subplots.AxesSubplot at 0x7fc707f26ed0>


#try to take data that have sales rank less than 50000
partial_table = merged_review_table[merged_review_table['rank'] <= 50000]

#min-max normalization
#partial_table['rank'] = (partial_table['rank'] - partial_table['rank'].min())/(partial_table['rank'].max()-partial_table['rank'].min())
partial_table


partial_table.corr()


num_review = partial_table[['asin','number-of-review', 'rank']]
num_review = num_review.sort_values(by=['rank'])
num_review


plt.figure(figsize = (10,8))
sns.scatterplot(x="number-of-review", y="rank", data=num_review)

<matplotlib.axes._subplots.AxesSubplot at 0x7fc707e4df90>


plt.figure(figsize = (10,8))
average_v_rank = sns.scatterplot(x='average-rating', 
                                 y='rank',
                                 data=partial_table)

average_v_rank.set(xlabel='Average Rating of Beauty Product', 
                   ylabel='Sales Rank', 
                   title="Average Rating of Beauty Products against Sales Rank")

[Text(0, 0.5, 'Sales Rank'),
 Text(0.5, 0, 'Average Rating of Beauty Product'),
 Text(0.5, 1.0, 'Average Rating of Beauty Products against Sales Rank')]


price_rank_table = product_table[['asin','rank','price']]
num_review_table = mv_review_table[['asin','number-of-review']]
# merge number of reviews
merged_price_table = (pd.merge(price_rank_table, num_review_table , on='asin', how='inner', indicator=True)).set_index('asin')
merged_price_table.reset_index(inplace = True)
merged_price_table = merged_price_table.drop(['_merge'],axis = 1)
merged_price_table.rename(columns={'overall':'number-of-review'},inplace=True)

# filter out product with 0 as sales rank 
#merged_price_table = merged_price_table[merged_price_table['rank'] > 0]
#merged_price_table = merged_price_table[merged_price_table['price'] > 0]

#remove outliers
price_z_scores = stats.zscore(pd.DataFrame(merged_price_table['price']))
price_abs_z_scores = np.abs(price_z_scores)
price_outlier = (price_abs_z_scores > 3).all(axis=1)
merged_price_table.drop(merged_price_table[price_outlier].index, inplace = True)
merged_price_table.sort_values(by=['rank'])


#min-max normalization
#price_rank_table['rank'] = (price_rank_table['rank'] - price_rank_table['rank'].min())/(price_rank_table['rank'].max()-price_rank_table['rank'].min())


filter_num_review = merged_price_table['number-of-review'] < 50
#filter_price = merged_price_table['price'] < 25
merged_price_table = merged_price_table[filter_num_review]
#merged_price_table = merged_price_table[filter_price]
merged_price_table.sort_values(by=['price'])


merged_price_table.corr()


# square brackets '[]' mean the end point is included
# round parentheses '()' mean it's excluded
merged_price_table['price-range'] = pd.cut(merged_price_table['price'],
                                                  [-1, 25, 75, 125, 200], 
                                                  right=True,
                                                  labels=['[0,25]', '(25,75]', '(75,125]', '(125,200]'])
merged_price_table


plt.figure(figsize = (15,10))
price_rank_graph = sns.scatterplot(x='price', 
                                 y='rank',
                                 data=merged_price_table, hue='price-range',palette=["k", "C3","C0","C2"])

price_rank_graph .set(xlabel='Price of Beauty Product', 
                   ylabel='Sales Rank', 
                   title="Price of Beauty Products against Sales Rank")

[Text(0, 0.5, 'Sales Rank'),
 Text(0.5, 0, 'Price of Beauty Product'),
 Text(0.5, 1.0, 'Price of Beauty Products against Sales Rank')]


# suitable for passing merged_review_table or mv_review_table
def calculateVoteScore(rev_table):
    
    g = final_review_table[['asin', 'overall', 'vote']].groupby(['asin']).apply(lambda x: (x['overall']*x['vote']).sum()/x['vote'].sum() if x['vote'].sum()!= 0 else 0)
    vote_score = pd.DataFrame(g, columns=['vote-score'])
    vote_score.reset_index(inplace=True)
    vote_score
    
    rev_table = (pd.merge(rev_table, vote_score, on='asin', how='inner', indicator=True)).set_index('asin')
    rev_table.reset_index(inplace = True)
    rev_table = rev_table.drop(['_merge'],axis = 1)
    # rev_table.rename(columns={'overall':'number-of-review'},inplace=True)

    return rev_table


mv_review_table = calculateVoteScore(mv_review_table)


new_mv_review_table = mv_review_table[mv_review_table['vote-score']>0]
new_mv_review_table


corr_test = mv_review_table[['rank', 'average-rating', 'number-of-review', 'average-verified-rating', 'vote-score']].corr()
corr_test


sns.heatmap(corr_test,square=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7fc707cd6050>


from sklearn.preprocessing import MinMaxScaler 

# suitable for passing merged_review_table or mv_review_table
def CalculateTimeWeightedAvg(rev_table):

    frt_copy = final_review_table.copy()
    mmScaler = MinMaxScaler()
    frt_copy[['unixReviewTime']] = mmScaler.fit_transform(frt_copy[['unixReviewTime']])
    frt_copy['time-weighted-overall'] = frt_copy['overall'] * frt_copy['unixReviewTime'] 
    
    g = frt_copy[['asin', 'time-weighted-overall']]
    g = g.groupby(['asin'])['time-weighted-overall'].mean()
    time_score = pd.DataFrame(g)#, columns=['time-weighted-avg'])
    time_score.reset_index(inplace=True)
    
    rev_table = (pd.merge(rev_table, time_score, on='asin', how='inner', indicator=True)).set_index('asin')
    rev_table.reset_index(inplace = True)
    rev_table = rev_table.drop(['_merge'],axis = 1)

    return rev_table


new_mv_review_table_withTime = CalculateTimeWeightedAvg(mv_review_table)
new_mv_review_table_withTime


sns.set(rc={'figure.figsize':(10,5)})
bp = sns.boxplot(x=mv_review_table["rank"])
bp.set(title='The Sales Rank of Beauty Products',
       xlabel='Sales Rank')

[Text(0.5, 0, 'Sales Rank'),
 Text(0.5, 1.0, 'The Sales Rank of Beauty Products')]


sns.set(rc={'figure.figsize':(10,5)})
bp = sns.boxplot(x=mv_review_table["average-rating"])
bp.set(title='The Average Rating of Beauty Products',
       xlabel='Average Rating')

[Text(0.5, 0, 'Average Rating'),
 Text(0.5, 1.0, 'The Average Rating of Beauty Products')]


sns.set(rc={'figure.figsize':(10,5)})
bp = sns.boxplot(x=mv_review_table["average-verified-rating"], palette="Oranges_r")
bp.set(title='The Average Verified Rating of Beauty Products',
       xlabel='Average Verified Rating')

[Text(0.5, 0, 'Average Verified Rating'),
 Text(0.5, 1.0, 'The Average Verified Rating of Beauty Products')]


mv_review_table_combine=pd.melt(mv_review_table[['average-rating', 'average-verified-rating']])


cbp=sns.boxplot(x="variable", y="value", data=mv_review_table_combine)
cbp.set(title='Comparison Between Average Rating and Average Verified Rating',
        xlabel='Types of Rating',
        ylabel='Value')

[Text(0, 0.5, 'Value'),
 Text(0.5, 0, 'Types of Rating'),
 Text(0.5, 1.0, 'Comparison Between Average Rating and Average Verified Rating')]


sns.set(rc={'figure.figsize':(10,5)})
bp = sns.boxplot(x=mv_review_table["number-of-review"])
bp.set(title='The Number of Reviews of Beauty Products',
       xlabel='Number of Reviews')

[Text(0.5, 0, 'Number of Reviews'),
 Text(0.5, 1.0, 'The Number of Reviews of Beauty Products')]


sns.set(rc={'figure.figsize':(10,5)})
bp = sns.boxplot(x=new_mv_review_table["vote-score"], palette="Greens_r")
bp.set(title='The Vote Weighted Rating of Beauty Products',
       xlabel='Vote Weighted Rating')

[Text(0.5, 0, 'Vote Weighted Rating'),
 Text(0.5, 1.0, 'The Vote Weighted Rating of Beauty Products')]


sns.set(rc={'figure.figsize':(10,5)})
bp = sns.boxplot(x=new_mv_review_table_withTime["time-weighted-overall"], palette="Reds")
bp.set(title='The Time Weighted Rating of Beauty Products',
       xlabel='Time Weighted Rating')

[Text(0.5, 0, 'Time Weighted Rating'),
 Text(0.5, 1.0, 'The Time Weighted Rating of Beauty Products')]


combined_rating=pd.melt(new_mv_review_table_withTime[['average-rating','average-verified-rating','vote-score','time-weighted-overall']])


cbp=sns.boxplot(x="variable", y="value", data=combined_rating)
cbp.set(title='Comparison Between Average Rating and Time Weighted Rating',
        xlabel='Types of Rating',
        ylabel='Rating')

[Text(0, 0.5, 'Rating'),
 Text(0.5, 0, 'Types of Rating'),
 Text(0.5, 1.0, 'Comparison Between Average Rating and Time Weighted Rating')]


sns.set(rc={'figure.figsize':(10,5)})
bp = sns.boxplot(x=merged_price_table["price"])
bp.set(title='The Price of Beauty Products',
       xlabel='Price')

[Text(0.5, 0, 'Price'), Text(0.5, 1.0, 'The Price of Beauty Products')]


#preparing required table
partial_product_table = product_table[['asin','price']]

##Merging Price with new_mv_review_table_withTime
final_review_price_table = (pd.merge(new_mv_review_table_withTime, partial_product_table, on='asin', how='inner', indicator=True)).set_index('asin')
final_review_price_table.reset_index(inplace = True)
final_review_price_table = final_review_price_table .drop(['_merge'],axis = 1)
final_review_price_table


final_review_price_table.describe()


correlation = final_review_price_table.corr()


sns.heatmap(correlation, square=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7076f60d0>


from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


also_buy_table = product_table.drop(['title','also_view','rank','price'],axis = 1)
also_buy_table


# separates each item in the list into a new row
also_buy_table = also_buy_table.explode('also_buy')
also_buy_table.reset_index(drop=True, inplace=True)

# cleaning up just to be safe
also_buy_table['also_buy'] = also_buy_table['also_buy'].str.strip()
also_buy_table.dropna(axis=0, subset=['also_buy'], inplace=True)
also_buy_table['also_buy'] = also_buy_table['also_buy'].astype('str')

# the product appear once in also buy (true, it was bought together)
also_buy_table['frequency'] = 1
also_buy_table


# dropping products that does not exist in our product_table dataset
also_buy = also_buy_table[also_buy_table['also_buy'].isin(product_table['asin'])]
also_buy.reset_index(drop=True, inplace=True)
also_buy


basket_ab = (also_buy.groupby(['asin', 'also_buy'])['frequency']
          .sum().unstack().reset_index().fillna(0)
          .set_index('asin')
         )
basket_ab


# support 
frequent_itemsets = apriori(basket_ab, min_support=0.01, use_colnames=True)
frequent_itemsets


# confidence rule
rules1_ab = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.01)
rules1_ab


# lift rule
rules2_ab = association_rules(frequent_itemsets, metric="lift", min_threshold=0.01)
rules2_ab


also_view_table = product_table.drop(['title','also_buy','rank','price'],axis = 1)
also_view_table


# Separates each item in the list into a new row
also_view_table = also_view_table.explode('also_view')
also_view_table.reset_index(drop=True, inplace=True)

# Cleaning up just to be safe
also_view_table['also_view'] = also_view_table['also_view'].str.strip()
also_view_table.dropna(axis=0, subset=['also_view'], inplace=True)
also_view_table['also_view'] = also_view_table['also_view'].astype('str')

# The product appear once in also buy (true, it was bought together)
also_view_table['frequency'] = 1
also_view_table = also_view_table[['asin', 'also_view','frequency']]
also_view_table


# Dropping products that does not exist in our product_table dataset
also_view = also_view_table[also_view_table['also_view'].isin(product_table['asin'])]
also_view.reset_index(drop=True, inplace=True)
also_view


basket_av = (also_view.groupby(['asin', 'also_view'])['frequency']
          .sum().unstack().reset_index().fillna(0)
          .set_index('asin')
         )
basket_av


frequent_itemsets_2 = apriori(basket_av, min_support=0.01, use_colnames=True)
frequent_itemsets_2


rules1_av = association_rules(frequent_itemsets_2, metric="confidence", min_threshold=0.001)
rules1_av


rules2_av = association_rules(frequent_itemsets_2, metric="lift", min_threshold=0.001)
rules2_av


also_buy_f = also_buy.groupby(['also_buy']).count()
also_buy_f.rename(columns={'frequency':'also-buy-freq'},inplace=True)
also_buy_f.drop(['asin'], axis = 1, inplace=True)
also_buy_f.reset_index(inplace = True)
also_buy_f.rename(columns={'also_buy':'asin'},inplace=True)
also_buy_f


mv2_review_table = (pd.merge(mv_review_table, also_buy_f, on='asin', how='left', indicator=True)).set_index('asin')
mv2_review_table.reset_index(inplace = True)
mv2_review_table['also-buy-freq'].fillna(0, inplace=True)
mv2_review_table.drop(['_merge'], axis=1, inplace=True)

mv2_review_table['also-buy-freq-range'] = pd.cut(mv2_review_table['also-buy-freq'],
                                                  [-1, 0, 10, 30], 
                                                  right=True,
                                                  labels=["0","(0, 10]","(10, 30]"])
mv2_review_table


mv2_review_table["also-buy-freq-range"].value_counts()

0           30920
(0, 10]      1154
(10, 30]        9
Name: also-buy-freq-range, dtype: int64


also_view_f = also_view.groupby(['also_view']).count()
also_view_f.rename(columns={'frequency':'also-view-freq'},inplace=True)
also_view_f.drop(['asin'], axis = 1, inplace=True)
also_view_f.reset_index(inplace = True)
also_view_f.rename(columns={'also_view':'asin'},inplace=True)
also_view_f


mv3_review_table = (pd.merge(mv_review_table, also_view_f, on='asin', how='left', indicator=True)).set_index('asin')
mv3_review_table.reset_index(inplace = True)
mv3_review_table['also-view-freq'].fillna(0, inplace=True)
mv3_review_table.drop(['_merge'], axis=1, inplace=True)

mv3_review_table['also-view-freq-range'] = pd.cut(mv3_review_table['also-view-freq'],
                                                  [-1, 0, 10, 30], 
                                                  right=True,
                                                  labels=["0","(0, 10]","(10, 30]"])
mv3_review_table


mv3_review_table["also-view-freq-range"].value_counts()

0           31033
(0, 10]      1047
(10, 30]        3
Name: also-view-freq-range, dtype: int64


#preparing required table
partial_product_table = product_table[['asin','price']]
#merging
merged_product_review_table = (pd.merge(merged_review_table, partial_product_table, on='asin', how='inner', indicator=True)).set_index('asin')
merged_product_review_table.reset_index(inplace = True)
merged_product_review_table = merged_product_review_table.drop(['_merge','rank'],axis = 1)
merged_product_review_table.rename(columns={'rank-binned':'rank'},inplace=True)

verified_rating_table = mv_review_table[['asin','average-verified-rating']][mv_review_table['average-verified-rating']>0]
merged_product_review_table = (pd.merge(merged_product_review_table, verified_rating_table, on='asin', how='inner', indicator=True)).set_index('asin')
merged_product_review_table.reset_index(inplace = True)
merged_product_review_table = merged_product_review_table.drop(['_merge','average-rating'],axis = 1)

time_score_table = new_mv_review_table_withTime[['asin','time-weighted-overall']]
merged_product_review_table = (pd.merge(merged_product_review_table, time_score_table, on='asin', how='inner', indicator=True)).set_index('asin')
merged_product_review_table.reset_index(inplace = True)
merged_product_review_table = merged_product_review_table.drop(['_merge','asin'],axis = 1)

#filtering
filter_price = merged_product_review_table['price'] > 0
#filter_num_review = merged_product_review_table['number-of-review'] <= 10
merged_product_review_table = merged_product_review_table[filter_price]
merged_product_review_table


#normalization
product_review_table = merged_product_review_table.drop(['rank'],axis = 1)
product_review_table = (product_review_table - product_review_table.min())/(product_review_table.max()-product_review_table.min())
product_review_table['rank'] = merged_product_review_table['rank']

product_review_table


product_review_table.corr()


#splitting test size
X = product_review_table.drop("rank", axis=1)  
# copy the labels to another dataframe/series
y = product_review_table["rank"].copy()  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=1)


#Model Training
model1 = MLPClassifier(hidden_layer_sizes =(4,5,3),max_iter=1000,random_state=1)
model1.fit(X_train,y_train)

model2 = KNeighborsClassifier(n_neighbors=20)
model2.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')


predictionMLP = model1.predict(X_test)
predictionKNC = model2.predict(X_test)


print("The accuracy of model A is",accuracy_score(y_test, predictionMLP))
print("The accuracy of model B is",accuracy_score(y_test, predictionKNC))

The accuracy of model A is 0.6812674743709226
The accuracy of model B is 0.6146318732525629


fig1,ax1 = plt.subplots(figsize = (20,10))
ax1.set_title("Confusion Matrix for Multilayer Perceptron Classifier(All Beauty Product)")
conf_matrix = confusion_matrix(y_test, predictionMLP)
mlp_display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,display_labels = model1.classes_)
mlp_display.plot(cmap = 'Greens',xticks_rotation ='vertical',ax=ax1)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fc707603690>


fig2,ax2 = plt.subplots(figsize = (20,10))
ax2.set_title("Confusion Matrix for KNeighborsClassifier(All Beauty Product)")
conf_matrix = confusion_matrix(y_test, predictionKNC)
mlp_display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,display_labels = model2.classes_)
mlp_display.plot(cmap = 'Greens',xticks_rotation ='vertical',ax=ax2)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fc707600050>


#ROC curve
encoder = LabelEncoder()
#predict probability
y_test = encoder.fit_transform(y_test)
pred_prob1 = model1.predict_proba(X_test)[:,1]
pred_prob2 = model2.predict_proba(X_test)[:,1]
# roc curve for models
fpr1, tpr1, thresh1 = roc_curve(y_test, pred_prob1, pos_label=1)
fpr2, tpr2, thresh2 = roc_curve(y_test, pred_prob2, pos_label=1)

# roc curve for tpr = fpr 
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)


# matplotlib
plt.style.use('seaborn')

# plot roc curves
plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='Multilayer Perceptron Classifier')
plt.plot(fpr2, tpr2, linestyle='--',color='green', label='KNN')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
# title
plt.title('ROC curve')
# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')

plt.legend(loc='best')
plt.savefig('ROC',dpi=300)
plt.show();


#preparing required table
partial_luxuryBeautyProduct_table = luxuryBeautyProduct_table[['asin','price']]
#merging
merged_luxuryBeautyProduct_review_table = (pd.merge(luxuryBeautyReview_table_withTime, partial_luxuryBeautyProduct_table, on='asin', how='inner', indicator=True)).set_index('asin')
merged_luxuryBeautyProduct_review_table.reset_index(inplace = True)
merged_luxuryBeautyProduct_review_table  = merged_luxuryBeautyProduct_review_table .drop(['_merge','rank','asin','average-rating'],axis = 1)
merged_luxuryBeautyProduct_review_table.rename(columns={'rank-binned':'rank'},inplace=True)

#normalization
luxuryBeautyProduct_review_table = merged_luxuryBeautyProduct_review_table.drop(['rank'],axis = 1)
luxuryBeautyProduct_review_table= (luxuryBeautyProduct_review_table - luxuryBeautyProduct_review_table.min())/(luxuryBeautyProduct_review_table.max()-luxuryBeautyProduct_review_table.min())
luxuryBeautyProduct_review_table['rank'] = merged_luxuryBeautyProduct_review_table['rank']

luxuryBeautyProduct_review_table


#splitting test size
XX = luxuryBeautyProduct_review_table.drop("rank", axis=1)  
# copy the labels to another dataframe/series
yy = luxuryBeautyProduct_review_table["rank"].copy()  
XX_train, XX_test, yy_train, yy_test = train_test_split(XX, yy, test_size=0.20,random_state=1)

#Model Training
model1.fit(XX_train,yy_train)

luxuryBeautyPredictionMLP = modelMLP.predict(XX_test)

print("The accuracy of model A is",accuracy_score(yy_test, luxuryBeautyPredictionMLP))

The accuracy of model A is 0.758781210325857


fig,ax = plt.subplots(figsize = (20,10))
ax.set_title("Confusion Matrix for Multilayer Layer Perceptron Classifier(Luxury Beauty Product)")
mlp_conf_matrix = confusion_matrix(yy_test, luxuryBeautyPredictionMLP)
display = ConfusionMatrixDisplay(confusion_matrix=mlp_conf_matrix,display_labels = modelMLP.classes_)
display.plot(cmap = 'Greens',xticks_rotation ='vertical',ax=ax)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fc708944a50>


mx = m.copy()
mx['ratio-high-to-total'] = mx['number-of-high-rating-review']/mx['number-of-review']
mx['average-rating-range'] = pd.cut(mx['average-rating'],
                                    [0, 3, 4, 6], 
                                    right=False,
                                    labels=["[1, 3)","[3, 4)","[4, 5]"])
mx


mx.corr()


plt.title('Ratio of High Rating Reviews against the Total Number of Reviews against the Sales Rank of a Beauty Product')
plt.ylabel('Sales Rank')
plt.xlabel('Ratio of High Rating Reviews against the Total Number of Reviews')
plot=sns.scatterplot(x="ratio-high-to-total", y="rank", data=mx, hue='average-rating-range', palette=["C0","C2","C3"])


sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='average-rating', y='rank', data = merged_review_table, hue='rank-binned')

<matplotlib.axes._subplots.AxesSubplot at 0x7fc70757b450>


sns.set(rc={'figure.figsize':(15,10)})
sttrplt=sns.scatterplot(x='number-of-review', y='rank', data = num_review_filter, hue='num-review-range')
plt.annotate(
# Label and coordinate
'Hard Candy Glow All The Way', xy=(110,0.86), xytext=(124, 0.9), color= "red", arrowprops={"facecolor": 'red', "shrink": 0.15}
)
plt.annotate(
# Label and coordinate
'Death and Dentistry', xy=(122,0.82), xytext=(137, 0.86), color= "red", arrowprops={"facecolor": 'red', "shrink": 0.20}
)
plt.annotate(
# Label and coordinate
'Norelco 8894 XL \nSpectra Shaver', xy=(107,0.64), xytext=(122, 0.70), color= "red", arrowprops={"facecolor": 'red', "shrink": 0.20}
)
plt.annotate(
# Label and coordinate
'My New Pink \nButton - Audry', xy=(160,0.62), xytext=(175, 0.68), color= "red", arrowprops={"facecolor": 'red', "shrink": 0.20}
)

sttrplt.set(xlabel='Total Number of Reviews', 
            ylabel='Sales Rank', 
            title="Total Number of Reviews against Sales Rank")

[Text(0, 0.5, 'Sales Rank'),
 Text(0.5, 0, 'Total Number of Reviews'),
 Text(0.5, 1.0, 'Total Number of Reviews against Sales Rank')]


plt.figure(figsize = (15,10))
price_rank_graph = sns.scatterplot(x='price', 
                                 y='rank',
                                 data=merged_price_table, hue='price-range',palette=["k", "C3","C0","C2"])

price_rank_graph .set(xlabel='Price of Beauty Product', 
                   ylabel='Sales Rank', 
                   title="Price of Beauty Products against Sales Rank")

[Text(0, 0.5, 'Sales Rank'),
 Text(0.5, 0, 'Price of Beauty Product'),
 Text(0.5, 1.0, 'Price of Beauty Products against Sales Rank')]


plt.figure(figsize = (15,10))
ax = sns.scatterplot(x='vote-score',y='rank',data=mv_review_table)

ax.set(xlabel='Vote Score', 
       ylabel='Sales Rank', 
       title="Vote Score of a Product against Sales Rank")

[Text(0, 0.5, 'Sales Rank'),
 Text(0.5, 0, 'Vote Score'),
 Text(0.5, 1.0, 'Vote Score of a Product against Sales Rank')]


timeframe = dfMergedInner[['year', 'month']].copy().groupby(['year']).count()
timeframe.rename(columns={'month':'number-of-review'},inplace=True)
timeframe.reset_index(inplace = True)
timeframe['cum-number-of-review'] = timeframe['number-of-review'].cumsum()
timeframe


plt.figure(figsize = (10,8))
ax = sns.barplot(x="year", y="number-of-review", data=timeframe, color="C9", saturation=.5)

ax.set(title='Total Number of Reviews throughout the Years',
       xlabel='Year',
       ylabel='Total Number of Beauty Product Reviews')

[Text(0, 0.5, 'Total Number of Beauty Product Reviews'),
 Text(0.5, 0, 'Year'),
 Text(0.5, 1.0, 'Total Number of Reviews throughout the Years')]


plt.figure(figsize = (10,8))
ax=sns.barplot(x="year", y="cum-number-of-review", data=timeframe, color="C9", saturation=.1)
sns.lineplot(x="year", y="cum-number-of-review", data=timeframe).plot(secondary_y=True)
ax.set(title='Cumulative Number of Reviews throughout the Years',
       xlabel='Year',
       ylabel='Cumulative Number of Beauty Product Reviews')

[Text(0, 0.5, 'Cumulative Number of Beauty Product Reviews'),
 Text(0.5, 0, 'Year'),
 Text(0.5, 1.0, 'Cumulative Number of Reviews throughout the Years')]


sns.set(rc={'figure.figsize':(20,20)})
sns.scatterplot(x='average-rating', y='rank', data = mv2_review_table, hue='also-buy-freq-range', palette=["C9","C0","C3"])

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7070150d0>


sns.set(rc={'figure.figsize':(20,20)})
sns.scatterplot(x='average-rating', y='rank', data = mv3_review_table, hue='also-view-freq-range', palette=["C9","C0","C3"])

<matplotlib.axes._subplots.AxesSubplot at 0x7fc707015cd0>


fig,ax = plt.subplots(figsize = (20,10))
ax.set_title("Confusion Matrix for Multilayer Layer Perceptron Classifier(Luxury Beauty Product)")
mlp_conf_matrix = confusion_matrix(yy_test, luxuryBeautyPredictionMLP)
display = ConfusionMatrixDisplay(confusion_matrix=mlp_conf_matrix,display_labels = modelMLP.classes_)
display.plot(cmap = 'Greens',xticks_rotation ='vertical',ax=ax)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fc706eb9e50>

	overall	verified	reviewTime	reviewerID	asin	reviewerName	reviewText	summary	unixReviewTime	vote	style	image
0	1.0	True	02 19, 2015	A1V6B6TNIC10QE	0143026860	theodore j bigham	great	One Star	1424304000	NaN	NaN	NaN
1	4.0	True	12 18, 2014	A2F5GHSXFQ0W6J	0143026860	Mary K. Byke	My husband wanted to reading about the Negro ...	... to reading about the Negro Baseball and th...	1418860800	NaN	NaN	NaN
2	4.0	True	08 10, 2014	A1572GUYS7DGSR	0143026860	David G	This book was very informative, covering all a...	Worth the Read	1407628800	NaN	NaN	NaN
3	5.0	True	03 11, 2013	A1PSGLFK1NSVO	0143026860	TamB	I am already a baseball fan and knew a bit abo...	Good Read	1362960000	NaN	NaN	NaN
4	5.0	True	12 25, 2011	A6IKXKZMTKGSC	0143026860	shoecanary	This was a good story of the Black leagues. I ...	More than facts, a good story read!	1324771200	5	NaN	NaN
5	5.0	False	02 26, 2010	A36NF437WZLQ9E	0143026860	W. Powell	Today I gave a book about the Negro Leagues of...	The Gift of Black Baseball	1267142400	4	NaN	NaN
6	4.0	False	03 7, 2001	A10Q8NIFOVOHFV	0143026860	Robert S. Clay Jr.	The story of race relations in American histor...	Baseball, America, and racism.	983923200	14	NaN	NaN
7	1.0	True	04 10, 2017	A26PO1B2Q2G1CS	014789302X	Jacqueline Diaz	I didn't like this product it smudged all unde...	One Star	1491782400	NaN	NaN	NaN
8	5.0	True	01 3, 2017	AQ812VYVTC2RJ	014789302X	Khadijah Ali-Evans	I simply love the product. I appreciate print ...	Five Stars	1483401600	NaN	NaN	NaN
9	1.0	True	11 19, 2016	AMACNEW14ADMX	014789302X	rabiyaa123	it burns your eyes when u put it on and very ...	i do not recommend.	1479513600	3	NaN	NaN

	ProductID	UserID	Review	Date
0	0143026860	A1V6B6TNIC10QE	1.0	1424304000
1	0143026860	A2F5GHSXFQ0W6J	4.0	1418860800
2	0143026860	A1572GUYS7DGSR	4.0	1407628800
3	0143026860	A1PSGLFK1NSVO	5.0	1362960000
4	0143026860	A6IKXKZMTKGSC	5.0	1324771200
5	0143026860	A36NF437WZLQ9E	5.0	1267142400
6	0143026860	A10Q8NIFOVOHFV	4.0	983923200
7	014789302X	A26PO1B2Q2G1CS	1.0	1491782400
8	014789302X	AQ812VYVTC2RJ	5.0	1483401600
9	014789302X	AMACNEW14ADMX	1.0	1479513600
10	014789302X	A5FS4FVWR77O6	2.0	1478736000
11	014789302X	A1NLF2JD7BVOB4	5.0	1474848000
12	014789302X	A3CKVF2ZR1CBO0	4.0	1470355200
13	014789302X	A3IFT6SR16SAYU	5.0	1469750400
14	014789302X	A2BQ7NB90SBVIA	5.0	1466208000
15	014789302X	A92AYWTA52KXE	5.0	1464134400
16	014789302X	AR6LNUWXJFYE6	5.0	1458950400
17	0992916305	A2V9BG2MDQVCYX	5.0	1422748800
18	0992916305	AIL9Q82L1G2YO	5.0	1422576000
19	0992916305	A1ROGN2QPKZGP7	4.0	1417392000
20	0992916305	A141Z5WI6GVM6Y	5.0	1415750400
21	0992916305	A1M3JNGXB8IU7I	5.0	1409788800
22	0992916305	A3NYLVMP922AE5	5.0	1406160000
23	1620213982	A1DN3KBTCYBH6K	5.0	1337126400
24	1620213982	A1TA38LWZO0NZS	5.0	1330214400
25	1620213982	A2OZBC5SG575G1	5.0	1326672000
26	1620213982	A2YKYIA11M9XTX	5.0	1322092800
27	1620213982	A32MUH70JIV7TS	5.0	1319846400
28	1620213982	A3S57667R6V4S4	5.0	1314403200
29	1620213982	A3JLEHAK528W8Z	5.0	1304035200
30	1620213982	A1XA71UE5A7F5B	5.0	1288483200
31	1620213982	A3BEXXPM77KR5Z	5.0	1284422400
32	1620213982	A9KCT397IWK97	5.0	1525737600
33	1620213982	A36Z95ZFDMNL99	5.0	1525737600
34	1620213982	AB6RYXLZSKHIR	5.0	1525564800
35	1620213982	A210V878QFZW9V	5.0	1525305600
36	1620213982	A18WMBY4BAB7ES	5.0	1525219200
37	1620213982	A2667UCM2FYMQ5	2.0	1525132800
38	1620213982	AQ3IBZ523E9NP	4.0	1524873600
39	1620213982	A2Q6ZCAOTIKW5A	5.0	1524873600
40	1620213982	A2HJQ1TBXTHXFQ	2.0	1524700800
41	1620213982	A33JNJ0DGA6JRU	4.0	1524700800
42	1620213982	A1KF66E65YEALF	5.0	1524700800
43	1620213982	AYYITJTFJCA77	5.0	1524614400
44	1620213982	A35GAM2X7G7LAI	5.0	1524009600
45	1620213982	A1Z8A548Z31SUB	2.0	1523923200
46	1620213982	A2XBDWQ3Z8GZM	4.0	1523923200
47	1620213982	A2FT0AFHQ32G33	5.0	1523836800
48	1620213982	AIVFUM7JDR4KM	5.0	1523664000
49	1620213982	A12GV837WV9DA2	5.0	1523491200

	overall	vote	verified	reviewTime	reviewerID	asin	reviewerName	reviewText	summary	unixReviewTime	style	image
0	2.0	3	True	06 15, 2010	A1Q6MUU0B2ZDQG	B00004U9V2	D. Poston	I bought two of these 8.5 fl oz hand cream, an...	dispensers don't work	1276560000	NaN	NaN
1	5.0	14	True	01 7, 2010	A3HO2SQDCZIE9S	B00004U9V2	chandra	Believe me, over the years I have tried many, ...	Best hand cream ever.	1262822400	NaN	NaN
2	5.0	NaN	True	04 18, 2018	A2EM03F99X3RJZ	B00004U9V2	Maureen G	Great hand lotion	Five Stars	1524009600	{'Size:': ' 3.5 oz.'}	NaN
3	5.0	NaN	True	04 18, 2018	A3Z74TDRGD0HU	B00004U9V2	Terry K	This is the best for the severely dry skin on ...	Five Stars	1524009600	{'Size:': ' 3.5 oz.'}	NaN
4	5.0	NaN	True	04 17, 2018	A2UXFNW9RTL4VM	B00004U9V2	Patricia Wood	The best non- oily hand cream ever. It heals o...	I always have a backup ready.	1523923200	{'Size:': ' 3.5 oz.'}	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...
574623	5.0	NaN	True	03 20, 2017	AHYJ78MVF4UQO	B01HIQEOLO	Lori Fox	Great color and I prefer shellac over gel	Five Stars	1489968000	NaN	NaN
574624	5.0	NaN	True	10 26, 2016	A1L2RT7KBNK02K	B01HIQEOLO	Elena	Best shellac I have ever used. It doesn't tak...	Best shellac I have ever used	1477440000	NaN	NaN
574625	5.0	NaN	True	09 30, 2016	A36MLXQX9WPPW9	B01HIQEOLO	Donna D. Harris	Great polish and beautiful color!!	Great polish!	1475193600	NaN	NaN
574626	1.0	2	True	12 5, 2016	A23DRCOMC2RIXF	B01HJ2UY0W	Y.Y. Chen	The perfume is good, but the spray head broke ...	Spray head broke off within a month	1480896000	{'Size:': ' 1.7 Fluid Ounce', 'Color:': ' Multi'}	NaN
574627	5.0	NaN	True	01 14, 2017	AJEDVHTLS9P3V	B01HJ2UY1G	ML Shelton	Great fragrance.	Five Stars	1484352000	{'Size:': ' 3.4 Fluid Ounce', 'Color:': ' Multi'}	NaN

	category	tech1	description	fit	title	also_buy	image	tech2	brand	feature	rank	also_view	details	main_cat	similar_item	date	price	asin
0	[]		[After a long day of handling thorny situation...		Crabtree & Evelyn - Gardener's Ultra-Moist...	[B00GHX7H0A, B00FRERO7G, B00R68QXCS, B000Z65AZ...	[https://images-na.ssl-images-amazon.com/image...			[]	4,324 in Beauty & Personal Care (	[B00FRERO7G, B00GHX7H0A, B07GFHJRMX, B00TJ3NBN...	{' Product Dimensions: ': '2.2 x 2.2 ...	Luxury Beauty			$30.00	B00004U9V2
1	[]		[If you haven't experienced the pleasures of b...		AHAVA Bath Salts	[]	[]			[]	1,633,549 in Beauty & Personal Care (	[]	{' Product Dimensions: ': '3 x 3.5 x ...	Luxury Beauty				B0000531EN
2	[]		[Rich, black mineral mud, harvested from the b...		AHAVA Dead Sea Mineral Mud, 8.5 oz, Pack of 4	[]	[https://images-na.ssl-images-amazon.com/image...			[]	1,806,710 in Beauty & Personal Care (	[]	{' Product Dimensions: ': '5.1 x 3 x ...	Luxury Beauty				B0000532JH
3	[]		[This liquid soap with convenient pump dispens...		Crabtree & Evelyn Hand Soap, Gardeners, 10...	[]	[https://images-na.ssl-images-amazon.com/image...			[]	[]	[B00004U9V2, B00GHX7H0A, B00FRERO7G, B00R68QXC...	{' Product Dimensions: ': '2.6 x 2.6 ...	Luxury Beauty			$15.99	B00005A77F
4	[]		[Remember why you love your favorite blanket? ...		Soy Milk Hand Crme	[B000NZT6KM, B001BY229Q, B008J724QY, B0009YGKJ...	[https://images-na.ssl-images-amazon.com/image...			[]	42,464 in Beauty & Personal Care (	[]	{' Product Dimensions: ': '7.2 x 2.2 ...	Luxury Beauty			$18.00	B00005NDTD
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
12294	[]		[, CND Craft Culture Collection: Patina Buckle...		CND Shellac Power Polish, Patina Buckle	[B003ONLAXQ, B00YDEZ9T6, B074KHRD13, B00R3PZK1...	[]			[]	88,740 in Beauty & Personal Care (	[B00D2VMUA2, B074KJZJYW, B074KHRD13, B073SB9JW...	{' Item Weight: ': '0.48 ounces', 'Sh...	Luxury Beauty			$15.95	B01HIQIEYC
12295	[]		[CND Shellac was designed to be used as a syst...		CND Shellac power polish denim patch	[B003ONLAXQ, B003OH0KBA, B004LEMWGG, B01MT91G4...	[]			[]	122,331 in Beauty & Personal Care (	[B00D2VMUA2, B01L0EV8X2, B004LEMWGG, B00EFGDYZ...	{'Shipping Weight:': '1.4 ounces (', 'ASIN:': ...	Luxury Beauty			$15.95	B01HIQHQU0
12296	[]		[CND Shellac was designed to be used as a syst...		CND Shellac, Leather Satchel	[B003ONLAXQ, B003OH0KBA, B004LEMWGG, B01MT91G4...	[https://images-na.ssl-images-amazon.com/image...			[]	168,028 in Beauty & Personal Care (	[B00D2VMUA2, B01L0EV8X2, B004LEMWGG, B00EFGDYZ...	{'Shipping Weight:': '1.4 ounces (', 'Domestic...	Luxury Beauty			$15.95	B01HIQEOLO
12297	[]		[The I AM JUICY COUTURE girl is once again tak...		Juicy Couture I Love Juicy Couture, 1.7 fl. Oz...	[]	[https://images-na.ssl-images-amazon.com/image...			[]	490,755 in Beauty & Personal Care (	[B0757439SY, B01HJ2UY1G, B01KX3TK7C, B01LX71LJ...	{' Product Dimensions: ': '3.3 x 2.7 ...	Luxury Beauty			$76.00	B01HJ2UY0W
12298	[]		[I Love Juicy Couture Eau De Parfum Spray 3.4 ...		Juicy Couture I Love Juicy Couture, 3.4 fl. Oz...	[B071NZZW3K]	[https://images-na.ssl-images-amazon.com/image...			[]	181,383 in Beauty & Personal Care (	[B0757439SY, B01LX71LJV, B01HJ2UY0W, B07GBSC3L...	{' Product Dimensions: ': '3.3 x 3.2 ...	Luxury Beauty			$96.00	B01HJ2UY1G

	year	month	day	overall	verified	reviewerID	asin	unixReviewTime	vote
0	2015	02	19	1.0	True	A1V6B6TNIC10QE	0143026860	1424304000	0
1	2014	12	18	4.0	True	A2F5GHSXFQ0W6J	0143026860	1418860800	0
2	2014	08	10	4.0	True	A1572GUYS7DGSR	0143026860	1407628800	0
3	2013	03	11	5.0	True	A1PSGLFK1NSVO	0143026860	1362960000	0
4	2011	12	25	5.0	True	A6IKXKZMTKGSC	0143026860	1324771200	5
...	...	...	...	...	...	...	...	...	...
362398	2017	07	20	1.0	True	A202DCI7TV1022	B01HJEGTYK	1500508800	0
362399	2017	03	16	5.0	True	A3FSOR5IJOFIBE	B01HJEGTYK	1489622400	34
362400	2017	03	1	5.0	True	A1B5DK6CTP2P24	B01HJEGTYK	1488326400	46
362401	2017	02	21	2.0	True	A23OUYS5IRMJS9	B01HJEGTYK	1487635200	0
362402	2016	12	15	2.0	True	A24KQ9RVU81L87	B01HJEGTYK	1481760000	0

	category	description	title	also_buy	image	brand	feature	rank	also_view	details	main_cat	similar_item	price	asin
0	[]	[Loud 'N Clear Personal Sound Amplifier allows...	Loud 'N Clear™ Personal Sound Amplifier	[]	[]	idea village	[]	2,938,573 in Beauty & Personal Care (	[]	{'ASIN: ': '6546546450'}	All Beauty			6546546450
1	[]	[No7 Lift & Luminate Triple Action Serum 50ml ...	No7 Lift & Luminate Triple Action Serum 50...	[B01E7LCSL6, B008X5RVME]	[]		[]	872,854 in Beauty & Personal Care (	[]	{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...	All Beauty	class="a-bordered a-horizontal-stripes a-spa...	$44.99	7178680776
2	[]	[No7 Stay Perfect Foundation now stays perfect...	No7 Stay Perfect Foundation Cool Vanilla by No7	[]	[]	No7	[]	956,696 in Beauty & Personal Care (	[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]	{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...	All Beauty		$28.76	7250468162
3	[]	[]	Wella Koleston Perfect Hair Colour 44/44 Mediu...	[B0041PBXX8]	[https://images-na.ssl-images-amazon.com/image...		[]	1,870,258 in Beauty & Personal Care (	[]	{' Item Weight: ': '1.76 ounces', 'Sh...	All Beauty			7367905066
4	[]	[Lacto Calamine Skin Balance Daily Nourishing ...	Lacto Calamine Skin Balance Oil control 120 ml...	[]	[https://images-na.ssl-images-amazon.com/image...	Pirmal Healthcare	[]	67,701 in Beauty & Personal Care (	[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...	{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...	All Beauty		$12.15	7414204790
5	[]	[Mary Kay Satin Hands Peach Hand Cream Travel ...	Mary Kay Satin Hands Hand Cream Travel MINI Si...	[]	[]	Mary Kay	[]	699,317 in Beauty & Personal Care (	[]	{'Shipping Weight:': '2.4 ounces', 'ASIN: ': '...	All Beauty			7535842801
6	[]	[]	Unique Custom Cast Iron Liner Shader Tattoo Ma...	[]	[]		[]	3,206,658 in Beauty & Personal Care (	[]	{'ASIN: ': '7538626107'}	All Beauty			7538626107
7	[]	[According to the legend, in 1613, two brother...	Imagen Bendita Por Su Santidad Our Lady of Cha...	[]	[]	Chango	[]	1,741,858 in Beauty & Personal Care (	[]	{' Item Weight: ': '8 ounces', 'Shipp...	All Beauty			8279996397
8	[]	[Novi prevod proslavljene knjige Zadruga objav...	Mali princ	[]	[https://images-na.ssl-images-amazon.com/image...	Srpska knjizevna zadruga	[]	2,136,432 in Beauty & Personal Care (	[]	{'Shipping Weight:': '7.7 ounces', 'ASIN: ': '...	All Beauty			8637910351
9	[]	[Moleskine's Card Wallet in Payne's Grey is th...	Moleskine Payne's Grey Card Wallet (Moleskine ...	[]	[https://images-na.ssl-images-amazon.com/image...	Sunatoria	[]	1,260,339 in Beauty & Personal Care (	[]	{' Product Dimensions: ': '2.8 x 4.3 ...	All Beauty			8867326759

	year	month	day	overall	vote	verified	reviewerID	asin	unixReviewTime
0	2010	06	15	2.0	3	True	A1Q6MUU0B2ZDQG	B00004U9V2	1276560000
1	2010	01	7	5.0	14	True	A3HO2SQDCZIE9S	B00004U9V2	1262822400
2	2018	04	18	5.0	0	True	A2EM03F99X3RJZ	B00004U9V2	1524009600
3	2018	04	18	5.0	0	True	A3Z74TDRGD0HU	B00004U9V2	1524009600
4	2018	04	17	5.0	0	True	A2UXFNW9RTL4VM	B00004U9V2	1523923200
...	...	...	...	...	...	...	...	...	...
538441	2017	03	20	5.0	0	True	AHYJ78MVF4UQO	B01HIQEOLO	1489968000
538442	2016	10	26	5.0	0	True	A1L2RT7KBNK02K	B01HIQEOLO	1477440000
538443	2016	09	30	5.0	0	True	A36MLXQX9WPPW9	B01HIQEOLO	1475193600
538444	2016	12	5	1.0	2	True	A23DRCOMC2RIXF	B01HJ2UY0W	1480896000
538445	2017	01	14	5.0	0	True	AJEDVHTLS9P3V	B01HJ2UY1G	1484352000

	title	also_buy	rank	also_view	price	asin
0	Crabtree & Evelyn - Gardener's Ultra-Moist...	[B00GHX7H0A, B00FRERO7G, B00R68QXCS, B000Z65AZ...	4324	[B00FRERO7G, B00GHX7H0A, B07GFHJRMX, B00TJ3NBN...	30.00	B00004U9V2
1	AHAVA Bath Salts	[]	1633549	[]	0.00	B0000531EN
2	AHAVA Dead Sea Mineral Mud, 8.5 oz, Pack of 4	[]	1806710	[]	0.00	B0000532JH
3	Crabtree & Evelyn Hand Soap, Gardeners, 10...	[]	0	[B00004U9V2, B00GHX7H0A, B00FRERO7G, B00R68QXC...	15.99	B00005A77F
4	Soy Milk Hand Crme	[B000NZT6KM, B001BY229Q, B008J724QY, B0009YGKJ...	42464	[]	18.00	B00005NDTD
...	...	...	...	...	...	...
12106	CND Shellac Power Polish, Patina Buckle	[B003ONLAXQ, B00YDEZ9T6, B074KHRD13, B00R3PZK1...	88740	[B00D2VMUA2, B074KJZJYW, B074KHRD13, B073SB9JW...	15.95	B01HIQIEYC
12107	CND Shellac power polish denim patch	[B003ONLAXQ, B003OH0KBA, B004LEMWGG, B01MT91G4...	122331	[B00D2VMUA2, B01L0EV8X2, B004LEMWGG, B00EFGDYZ...	15.95	B01HIQHQU0
12108	CND Shellac, Leather Satchel	[B003ONLAXQ, B003OH0KBA, B004LEMWGG, B01MT91G4...	168028	[B00D2VMUA2, B01L0EV8X2, B004LEMWGG, B00EFGDYZ...	15.95	B01HIQEOLO
12109	Juicy Couture I Love Juicy Couture, 1.7 fl. Oz...	[]	490755	[B0757439SY, B01HJ2UY1G, B01KX3TK7C, B01LX71LJ...	76.00	B01HJ2UY0W
12110	Juicy Couture I Love Juicy Couture, 3.4 fl. Oz...	[B071NZZW3K]	181383	[B0757439SY, B01LX71LJV, B01HJ2UY0W, B07GBSC3L...	96.00	B01HJ2UY1G

	title	also_buy	rank	also_view	price	asin
0	Loud 'N Clear™ Personal Sound Amplifier	[]	0.844068	[]	0.00	6546546450
1	No7 Lift & Luminate Triple Action Serum 50...	[B01E7LCSL6, B008X5RVME]	0.250716	[]	44.99	7178680776
2	No7 Stay Perfect Foundation Cool Vanilla by No7	[]	0.274799	[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]	28.76	7250468162
3	Wella Koleston Perfect Hair Colour 44/44 Mediu...	[B0041PBXX8]	0.537208	[]	0.00	7367905066
4	Lacto Calamine Skin Balance Oil control 120 ml...	[]	0.019446	[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...	12.15	7414204790
...	...	...	...	...	...	...
32483	Barielle Pro Textured Grip Cuticle Nipper, Purple	[]	0.616217	[]	9.95	B01HIWLLUK
32484	(Buy 3 Get 1 Free) Salon Perfect Eye Makeup Co...	[]	0.470987	[]	0.00	B01HJ1K3YK
32485	NOW D-Mannose 500 mg - 120 Veg Capsules (Pack ...	[B01KON9B4S, B079X3YFXS, B00M79OYS6, B000JN4CR...	0.059576	[B01KON9B4S, B000JN4CR0, B071ZHMRHS, B01HJ84TN...	55.63	B01HJ84SGM
32486	12 White Feather Shuttlecocks Birdies Badminto...	[]	0.277377	[]	12.99	B01HJASD20
32487	FESHFEN Scrunchy Scrunchies Synthetic Hair Bun...	[B01HJEGQ6Q, B01L66EYCG, B00H7YZ7HI, B00CHKKWZ...	0.137335	[B06XNM7CGD, B071D7TMQS]	0.00	B01HJEGTYK

	title	also_buy	rank	also_view	price	year	month	day	overall	verified	reviewerID	unixReviewTime	vote	_merge
asin
B000NICBSI	Camille Beckman Glycerin Hand Therapy, English...	[]	0.450576	[]	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	left_only
B01HHYC4UA	BabyMax Professional Tourmaline Ceramic Automa...	[]	0.363253	[]	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	left_only

	asin	rank	average-rating	number-of-review	average-verified-rating
0	6546546450	0.844068	2.5	2	4.000000
1	7178680776	0.250716	3.0	1	3.000000
2	7250468162	0.274799	5.0	1	5.000000
3	7367905066	0.537208	5.0	1	5.000000
4	7414204790	0.019446	4.4	15	4.461538
...	...	...	...	...	...
32480	B01HIWLLUK	0.616217	5.0	1	5.000000
32481	B01HJ1K3YK	0.470987	1.0	1	1.000000
32482	B01HJ84SGM	0.059576	5.0	5	5.000000
32483	B01HJASD20	0.277377	1.0	1	1.000000
32484	B01HJEGTYK	0.137335	3.0	10	3.000000

	rank	average-rating	number-of-review
rank	1.000000	-0.062927	-0.120137
average-rating	-0.062927	1.000000	0.006235
number-of-review	-0.120137	0.006235	1.000000

	asin	max	min	diff
0	6546546450	2013	2009	5
1	7178680776	2017	2017	1
2	7250468162	2017	2017	1
3	7367905066	2017	2017	1
4	7414204790	2018	2014	5
...	...	...	...	...
32480	B01HIWLLUK	2016	2016	1
32481	B01HJ1K3YK	2017	2017	1
32482	B01HJ84SGM	2018	2017	2
32483	B01HJASD20	2016	2016	1
32484	B01HJEGTYK	2018	2016	3

	asin	rank	average-rating	number-of-review
0	B00004U9V2	4324	4.616462	571
1	B0000531EN	1633549	4.500000	6
2	B0000532JH	1806710	3.125000	8
4	B00005NDTD	42464	4.862500	80
5	B00005R7ZZ	1527650	4.600000	5
...	...	...	...	...
11904	B01HIQIEYC	88740	4.360000	25
11905	B01HIQHQU0	122331	4.500000	14
11906	B01HIQEOLO	168028	5.000000	11
11907	B01HJ2UY0W	490755	1.000000	1
11908	B01HJ2UY1G	181383	5.000000	1

	asin	rank	average-rating	number-of-review	rank-binned	average-rating-range
0	6546546450	0.844068	2.5	2	Bad	[1, 3)
1	7178680776	0.250716	3.0	1	Good	[3, 4)
2	7250468162	0.274799	5.0	1	Good	[4, 5]
3	7367905066	0.537208	5.0	1	Moderate	[4, 5]
4	7414204790	0.019446	4.4	15	Very Good	[4, 5]
...	...	...	...	...	...	...
32480	B01HIWLLUK	0.616217	5.0	1	Moderate	[4, 5]
32481	B01HJ1K3YK	0.470987	1.0	1	Good	[1, 3)
32482	B01HJ84SGM	0.059576	5.0	5	Very Good	[4, 5]
32483	B01HJASD20	0.277377	1.0	1	Good	[1, 3)
32484	B01HJEGTYK	0.137335	3.0	10	Very Good	[3, 4)

	asin	number-of-review	rank
8135	B00BXPVIT8	13	0.000010
5863	B006IB5T4W	2790	0.000019
11623	B00HYEPVG8	34	0.000070
3201	B001QY8QXM	4543	0.000080
77	B00005JS5C	2249	0.000131
...	...	...	...
18734	B00TBTBJLA	1	0.995547
23707	B015X6LH8E	1	0.996050
31228	B01FWLYTG2	1	0.996580
3019	B001LNW3ZK	2	0.996847
2948	B001JYXGC0	1	1.000000

	asin	number-of-review	rank	num-review-range
8135	B00BXPVIT8	13	0.000010	(10,200)
11623	B00HYEPVG8	34	0.000070	(10,200)
8137	B00BXR0SFG	2	0.000245	[1,4]
29417	B01DJI7796	123	0.000292	(10,200)
28053	B01C39X6TW	143	0.000313	(10,200)
...	...	...	...	...
18734	B00TBTBJLA	1	0.995547	[1,4]
23707	B015X6LH8E	1	0.996050	[1,4]
31228	B01FWLYTG2	1	0.996580	[1,4]
3019	B001LNW3ZK	2	0.996847	[1,4]
2948	B001JYXGC0	1	1.000000	[1,4]

	year	asin	rank	count	mean
43	2000	B000050B60	0.868033	1	5.0
46	2000	B000050B69	0.880707	3	5.0
48	2000	B000050B62	0.965073	1	5.0
94	2000	B000050FED	0.901629	1	3.0
80	2000	B000050B64	0.925721	1	5.0
...	...	...	...	...	...
2214	2018	B000B658RI	0.192526	1	5.0
46770	2018	B013XFYYD2	0.115749	2	4.0
46776	2018	B013XJET9M	0.149915	1	2.0
46710	2018	B013V9RQEE	0.245075	1	5.0
62405	2018	B01HJEGTYK	0.137335	1	2.0

	year	asin	rank	count	mean	nrank
0	2017	B00BXPVIT8	0.000010	3	4.666667	1
1	2018	B00BXPVIT8	0.000010	3	5.000000	2
2	2014	B00BXPVIT8	0.000010	2	4.500000	3
3	2016	B00BXPVIT8	0.000010	4	5.000000	4
4	2015	B00BXPVIT8	0.000010	1	5.000000	5
...	...	...	...	...	...	...
62401	2016	B015X6LH8E	0.996050	1	2.000000	62402
62402	2016	B01FWLYTG2	0.996580	1	1.000000	62403
62403	2008	B001LNW3ZK	0.996847	1	5.000000	62404
62404	2014	B001LNW3ZK	0.996847	1	5.000000	62405
62405	2009	B001JYXGC0	1.000000	1	5.000000	62406

	asin	rank	average-rating	number-of-review	rank-binned	upper-rank-binned
11444	B00HNXZ5A2	0.641389	1.0	1	Moderate	3
2102	B0013NFY8G	0.342832	1.0	3	Good	2
24108	B016NLMDHW	0.231473	1.0	1	Very Good	1
5003	B004X2HNR4	0.944680	1.0	1	Bad	4
2109	B0013THRI0	0.770144	1.0	1	Bad	4
...	...	...	...	...	...	...
18196	B00S1KYWBE	0.459015	5.0	1	Good	2
18199	B00S1RKTS2	0.427705	5.0	1	Good	2
18202	B00S20MNDW	0.553129	5.0	1	Moderate	3
3489	B0028XO2TW	0.194477	5.0	2	Very Good	1
19427	B00USBJOQW	0.509694	5.0	1	Moderate	3

	asin	rank	average-rating	number-of-review	rank-binned	upper-rank-binned	average-rating-range
4	7414204790	0.019446	4.400000	15	Very Good	1	[4, 5]
5	7535842801	0.200870	4.000000	23	Very Good	1	[4, 5]
17	9742121109	0.122233	4.333333	3	Very Good	1	[4, 5]
20	9790787006	0.085853	4.228070	57	Very Good	1	[4, 5]
26	B000050B6B	0.042678	4.441558	154	Very Good	1	[4, 5]

	asin	rank	price	number-of-review
7970	B00BXPVIT8	0.000010	21.44	13
5717	B006IB5T4W	0.000019	11.29	2790
11426	B00HYEPVG8	0.000070	9.59	34
3091	B001QY8QXM	0.000080	9.30	4543
68	B00005JS5C	0.000131	18.49	2249
...	...	...	...	...
18486	B00TBTBJLA	0.995547	0.00	1
23407	B015X6LH8E	0.996050	50.60	1
30844	B01FWLYTG2	0.996580	0.00	1
2911	B001LNW3ZK	0.996847	0.00	2
2840	B001JYXGC0	1.000000	0.00	1

	rank	price	number-of-review
rank	1.000000	-0.232641	-0.420948
price	-0.232641	1.000000	0.024136
number-of-review	-0.420948	0.024136	1.000000

	asin	rank	average-rating	number-of-review	average-verified-rating	vote-score
0	6546546450	0.844068	2.500000	2	4.000000	1.000000
4	7414204790	0.019446	4.400000	15	4.461538	4.000000
5	7535842801	0.200870	4.000000	23	4.000000	3.181818
19	9752980953	0.667839	1.000000	1	1.000000	1.000000
20	9790787006	0.085853	4.228070	57	4.116279	5.000000
...	...	...	...	...	...	...
32067	B01HI74BV6	0.102599	4.750000	4	4.750000	4.555556
32075	B01HIPOQ2M	0.059379	3.878049	41	3.850000	3.809524
32077	B01HIWKGOM	0.087421	5.000000	1	5.000000	5.000000
32081	B01HJASD20	0.277377	1.000000	1	1.000000	1.000000
32082	B01HJEGTYK	0.137335	3.000000	10	3.000000	4.809524

	rank	average-rating	number-of-review	average-verified-rating	vote-score	time-weighted-overall	price
count	32083.000000	32083.000000	32083.000000	32083.000000	32083.000000	32083.000000	32083.000000
mean	0.367673	4.039871	11.007107	3.825636	1.010553	3.397758	9.624540
std	0.231363	1.142294	91.980093	1.433336	1.833135	1.062432	34.267666
min	0.000010	1.000000	1.000000	0.000000	0.000000	0.156530	0.000000
25%	0.194328	3.500000	1.000000	3.000000	0.000000	2.782916	0.000000
50%	0.324216	4.428571	2.000000	4.333333	0.000000	3.634031	0.000000
75%	0.499864	5.000000	5.000000	5.000000	1.000000	4.244692	9.570000
max	1.000000	5.000000	8671.000000	5.000000	5.000000	4.981671	2149.000000

	also_buy	asin	frequency
1	B01E7LCSL6	7178680776	1
2	B008X5RVME	7178680776	1
4	B0041PBXX8	7367905066	1
18	B00IWI42GU	9742121109	1
19	B005MZPP6M	9742121109	1
...	...	...	...
128976	B011S54ES4	B01HJEGTYK	1
128977	B01DI91VCA	B01HJEGTYK	1
128978	B00CHKKWVI	B01HJEGTYK	1
128979	B0713RP29K	B01HJEGTYK	1
128980	B00P2SZBWM	B01HJEGTYK	1

	also_buy	asin	frequency
0	B001E96P6O	B000052YAN	1
1	B0122ZKPHI	B000052YAN	1
2	B0013NB7DW	B0000530HU	1
3	B00UY2PRXE	B0000Y3DD2	1
4	B00EW7J1HK	B0000Y8IOQ	1
...	...	...	...
2334	B01EBIY7HS	B01HG1BF9A	1
2335	B01E82O696	B01HG1BF9A	1
2336	B013ISARHG	B01HHAWG20	1
2337	B01DNS19J6	B01HHLKUCC	1
2338	B01D6JFM04	B01HIH2QTU	1

	support	itemsets
0	0.022013	(B007Z5RTRK)
1	0.015723	(B00C4207LY)
2	0.010220	(B00ZGWVZB0)
3	0.018082	(B017CHORY8)
4	0.016509	(B018J05XSQ)
5	0.011792	(B007Z5RTRK, B018J05XSQ)

	asin	also_view	frequency
2	7250468162	B01B8BR0O8	1
3	7250468162	B01B8BR0NO	1
4	7250468162	B014MHXXM8	1
6	7414204790	3254895630	1
7	7414204790	B007VL1D9S	1
...	...	...	...
143314	B01HJ84SGM	B079X3YFXS	1
143315	B01HJ84SGM	B005DIBEXG	1
143316	B01HJ84SGM	B000HCMK90	1
143318	B01HJEGTYK	B06XNM7CGD	1
143319	B01HJEGTYK	B071D7TMQS	1

Project Part 2: Amazon Beauty Products¶

1.0 Introduction¶

2.0 Loading Datasets¶

2.1 meta_All_Beauty.json¶

2.2 Review_Beauty.json¶

2.3 All_Beauty.csv¶

2.4 Luxury_Beauty.json¶

2.5 meta_Luxury_Beauty.json¶

3.0 Data Cleaning¶

3.1 Cleaning meta_All_Beauty.json¶

3.2 Cleaning Review_Beauty.json¶

3.3 Cleaning Luxury_Beauty.json¶

3.4 Cleaning meta_Luxury_Beauty.json¶

4.0 Data Transformation¶

4.1 Normalization¶

4.2 Merging¶

4.2.1 Merging review_table and product_table¶

4.2.2 Final Merged Table¶

4.3 Useful Functions¶

4.3.1 process_merged_review¶

4.3.2 process_verified_review¶

4.3.3 time frame¶

4.3.4 Calculate Density¶

4.4 Merging For Luxury Beauty¶

4.4.1 Final_luxuryBeautyReview_table¶

4.4.2 merged_luxuryBeautyReview_table¶

4.4.3 mv_luxuryBeautyReview_table¶

4.4.4 Process for final luxury beauty table¶

4.5 Binning¶

4.5.1 Binning for All Beauty Product¶

4.5.2 Binning for Luxury Beauty Product¶

5.0 Data Analysis¶

5.1 Corr: avg rating && sales rank¶

5.2 Corr: num review && sales rank¶

5.2.1 num review 4 bins¶

5.2.2 num of high/low rating¶

5.2.3 correlation at different year (2000-2018)¶

5.3 Analysing "Very Good" Products¶

5.4 Slice Sales Rank < 5000¶

5.5 Price VS Sales Rank¶

5.6 Analysing Review Upvotes¶

5.7 Analysing Time-Weighted-Average¶

5.8 Bivariate Attribute Analysis¶

5.8.1 Sales Rank¶

5.8.2 Average Rating¶

5.8.3 Average Verified Rating¶

5.8.4 Number of Reviews¶

5.8.5 Vote Score¶

5.8.6 Time Weighted Overall¶

5.8.7 Price¶

5.9 Correlation of All Attributes¶

6.0 Data Mining¶

6.1 Association Rule Mining (ARM)¶

6.1.1 Also Buy Products¶

6.1.3 Also View Products¶

6.1.3 Also Buy and Also View Frequency Analysis¶

6.2 Predictive Modeling¶

6.2.1 Predictive Model for All Beauty Product's Sales Rank¶

6.2.2 Predictive Model for Luxury Beauty Product's Sales Rank¶

7.0 Data Visualization¶

7.1 Number of High Ratings VS Sales Rank¶

7.2 Average Rating VS Sales Rank¶

7.3 Number of Reviews VS Sales Rank¶

7.4 Price V.S. Sales Rank¶

7.5 Review Up Votes V.S. Sales Rank¶

7.6 Timeframe and Reviews¶

7.7 Also Buy and Also View of the Product Reviews¶

7.8 Prediction Accuracy for Luxury Beauty Products¶

Conclusion¶

	asin	also_view	frequency
0	B000050B6B	B00009RB1C	1
1	B000050B6H	B0001HYLR0	1
2	B000052YAN	B001E96P6O	1
3	B000052YAN	B001F51RAG	1
4	B0000533AX	B001F51RAG	1
...	...	...	...
1773	B01HCCP4GS	B018OB5466	1
1774	B01HE4QV52	B01E1QXTEW	1
1775	B01HFLLNXE	B01CWJHZW2	1
1776	B01HI1YKIG	B00M2J4HXI	1
1777	B01HI1YKIG	B00JCSLA6O	1

	asin	also-buy-freq
0	B000052YAN	2
1	B00005JS5C	1
2	B00012FL8I	3
3	B000142J2W	1
4	B00015WOFS	1
...	...	...
1163	B01HEEJOQA	1
1164	B01HFUTUBC	1
1165	B01HG1BF9A	3
1166	B01HHAWG20	1
1167	B01HIHLFOC	1

	asin	also-view-freq
0	B000050B6H	1
1	B000068PBM	1
2	B00009RB0Y	2
3	B00009RB1C	2
4	B0000C4KJV	5
...	...	...
1047	B01HE4QV52	1
1048	B01HFWIKH0	1
1049	B01HH1CDR2	1
1050	B01HHWQX8G	1
1051	B01HI1YKIG	1

	number-of-review	price	average-verified-rating	time-weighted-overall	rank
1	0.000000	0.020931	0.500000	0.538948	Good
2	0.000000	0.013378	1.000000	0.938010	Good
4	0.003082	0.005649	0.865385	0.753091	Very Good
11	0.000000	0.011624	0.000000	0.131720	Good
18	0.000000	0.010209	0.000000	0.072026	Moderate
...	...	...	...	...	...
30422	0.008807	0.002550	0.712500	0.691603	Very Good
30423	0.000881	0.009646	0.800000	0.750599	Very Good
30425	0.000000	0.004625	1.000000	0.882909	Moderate
30427	0.000881	0.025882	1.000000	0.939552	Very Good
30428	0.000000	0.006040	0.000000	0.120010	Good

	number-of-review	price	average-verified-rating	time-weighted-overall
number-of-review	1.000000	-0.029867	0.004268	-0.020043
price	-0.029867	1.000000	0.038019	0.028474
average-verified-rating	0.004268	0.038019	1.000000	0.922421
time-weighted-overall	-0.020043	0.028474	0.922421	1.000000

	number-of-review	average-verified-rating	time-weighted-overall	price	rank
0	0.164407	0.925969	0.736299	0.016667	Very Good
1	0.001442	0.800000	0.224920	0.000000	Bad
2	0.002019	0.633333	0.169006	0.000000	Bad
3	0.022786	0.974026	0.759892	0.010000	Very Good
4	0.001154	0.866667	0.370858	0.000000	Moderate
...	...	...	...	...	...
11808	0.006922	0.866667	0.767151	0.008861	Very Good
11809	0.003750	0.900000	0.809123	0.008861	Very Good
11810	0.002884	1.000000	0.898717	0.008861	Very Good
11811	0.000000	0.200000	0.059516	0.042222	Very Good
11812	0.000000	1.000000	0.872190	0.053333	Very Good

	rank	average-rating	number-of-review	number-of-high-rating-review	number-of-low-rating-review	ratio-high-to-total
rank	1.000000	-0.065697	-0.419714	-0.414280	-0.295049	-0.064359
average-rating	-0.065697	1.000000	-0.011613	0.134331	-0.317108	0.930976
number-of-review	-0.419714	-0.011613	1.000000	0.954098	0.772807	-0.008575
number-of-high-rating-review	-0.414280	0.134331	0.954098	1.000000	0.547263	0.142145
number-of-low-rating-review	-0.295049	-0.317108	0.772807	0.547263	1.000000	-0.325177
ratio-high-to-total	-0.064359	0.930976	-0.008575	0.142145	-0.325177	1.000000

	year	number-of-review	cum-number-of-review
0	2000	10	10
1	2001	65	75
2	2002	77	152
3	2003	118	270
4	2004	290	560
5	2005	559	1119
6	2006	943	2062
7	2007	3048	5110
8	2008	3610	8720
9	2009	3940	12660
10	2010	4951	17611
11	2011	6373	23984
12	2012	9179	33163
13	2013	25278	58441
14	2014	37277	95718
15	2015	65817	161535
16	2016	97984	259519
17	2017	67940	327459
18	2018	29818	357277