#!/usr/bin/env python # coding: utf-8 # In[1]: #Importing Libraries import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns #Handling the Dataset. Inpath= "C:/Users/kelli/Documents/DCU/Y4/Mt-412 Analytics Portfolio/" Airbnb_Df=pd.read_excel(Inpath+"Listings Power BI Assignmnet.xlsx" ,header=0, index_col=0) Airbnb_Df # In[2]: #Describe the dataset print(Airbnb_Df.head()) print(Airbnb_Df.tail()) print(Airbnb_Df.shape) # In[3]: Airbnb_Df.describe(include="all") # In[4]: #Handling missing / Nan values (Airbnb_Df.isna().sum(axis=0)) print(Airbnb_Df.isna().sum(axis=0)/len(Airbnb_Df)) # In[5]: #As we will not be using the values within district and the null values were so high we dropped the column. Airbnb_Df.drop("district", axis=1,inplace=True) Airbnb_Df # In[6]: Airbnb_Df.describe(include="all") # In[7]: #Checking that the column has been dropped from the dataframe. list(Airbnb_Df.columns) # In[8]: # After checking that the null values were evenly distributed by country we decided to drop the rows with null values for the reviews. Airbnb_Df.dropna(subset=["review_scores_rating","review_scores_accuracy","review_scores_cleanliness","review_scores_checkin", "review_scores_communication", "review_scores_location", "review_scores_value"], inplace=True) Airbnb_Df # In[9]: #Handling missing / Nan values (Airbnb_Df.isna().sum(axis=0)) print(Airbnb_Df.isna().sum(axis=0)/len(Airbnb_Df)) # In[10]: # We chose to keep the columns host_response_time, host_response_rate, host_acceptance_rate as we would loose too much valuable data. # We will use filters to drop null values within power bi when specifically looking at these measures. # In[ ]: Airbnb_Df.describe(include="all") # In[ ]: #Checking for duplicated rows, deleting them while keeping the first instance. #print("There are " + str(Airbnb_Df.duplicated().sum()) + " duplicated Rows") Airbnb_Df[Airbnb_Df.index.duplicated()] #this resulted in no duplicates found. This could be either due to previous cleaning or the high quaility of the original data. # In[11]: # Investigating the price to check for outliers #Examining through percentiles. percentiles = [25, 50, 75, 90, 95] percentiles.extend([i/10 for i in range(990, 1000, 1)] + [100]) # Calculate the percentiles of the 'price' column price_percentiles = np.percentile(Airbnb_Df['price in €'], percentiles) # Print the calculated percentiles for percentile, value in zip(percentiles, price_percentiles): print(f'{percentile}th Percentile: {value:.2f}') #While we found that there were a few outliers, we manually investigated them and concluded that they were true outliers and that they accurately reflected the range in price for each city. #We also invesigated these outliers by removing them and found that the mean price per city was minimally affected and so decided to keep the outliers to reduce unnecessary data loss. # In[12]: #Using One-Hot Encoding on the nominal variable "Amenities" Airbnb_Df = pd.get_dummies(Airbnb_Df, columns=['amenities'], prefix='amenities_provided') #While we manually filtered the amenities within Power BI, this is an alternative way to filter out each amenity, which is much quicker than manually filtering. # In[13]: #The data is no cleaned and prepared for visualisations within Power BI. #Exporting the data from python as a excel file. Airbnb_Df.to_excel(Inpath + 'Listings Power BI Assignmnet_Cleaned.xlsx', index=True)