In this data-driven study we examine how several factors influence taxi driver tips. Since any analysis is only as good as the data behind it, we work with taxi rides in the city of Chicago for the entire year of 2017. After a thorough look at the available columns, we plan to rigorously test and explore the following questions in the data:
# All the modules we'll ever need for the analysis; see inline comments for more information
import dask                              # For large-scale data analysis
import dask.dataframe as dd              # Provides a pandas-like environment for large-scale data
from dask.distributed import Client      # A compute cluster to parallelize tasks
import matplotlib.pyplot as plt          # The ubiquitous Matplotlib
import seaborn as sns                    # Seaborn: just because it looks way cooler
import scipy.stats as stats              # SciPy: for statistical analysis
import swifter                           # Swifter: speeds up pandas apply() calls
import geopandas as gpd                  # GeoPandas: for geospatial analysis
import itertools                         # For getting combinations (pairs) in an array, etc.
import numpy as np                       # Accelerated linear algebra computations
from IPython.display import HTML, Image  # Display .png/.html files in the notebook
client = Client()
client
df = dd.read_csv('2017_*.csv', dtype={'Trip Seconds': 'float64'})
number_of_records = len(df)
print('There are {} records present'.format(number_of_records))
Hence there are around 25 million records, each corresponding to a taxi ride taken in Chicago during 2017.
############INSERT EXPLANATION/TABLE AS TO WHICH COLUMNS WE CHOSE AND WHY############
df.describe().compute()
Computing the Pearson correlation coefficient between trip time and tips to check for a linear relationship
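As a reminder, the Pearson correlation coefficient between two variables $x$ and $y$ is defined as:

$$ r = \frac{\sum_{i}(x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i}(x_i - \bar{x})^2}\,\sqrt{\sum_{i}(y_i - \bar{y})^2}} $$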
time_df = df[['Trip Seconds', 'Tips']]
correlation = time_df.corr()
correlation = correlation.compute()
corr_coef = correlation['Tips']['Trip Seconds']
corr_coef
The correlation coefficient of 0.3209 indicates a moderately positive correlation. However, this is computed over all of the data, outliers included.
Let us now try to remove outliers on both Tips and Trip Times and compute the correlation coefficient
In contrast to the typical definition of an outlier as any data point more than $1.5 \times \text{IQR}$ beyond the quartiles, we offer a slightly more liberal definition: an outlier is anything that falls above the 97th percentile of the tip values. We suggest this because, from past experience, the most generous tip a taxi driver is likely to get is around USD 5 to USD 10, and upon checking the distribution of quantiles as shown below, the 97th percentile of tip values is 10.3; we feel that is a suitable (if somewhat arbitrary) threshold based on personal experience. A more stringent outlier cutoff on Tips might distort the nature of our statistical analysis.
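For comparison, a minimal sketch of the conventional $1.5 \times \text{IQR}$ upper fence on the same `time_df` frame might look like the following (illustrative only; this threshold is not used in the analysis below):

```python
# Illustrative sketch: conventional 1.5 * IQR upper fence for Tips.
# Not used in the analysis below -- we use the 97th percentile instead.
q1, q3 = time_df['Tips'].quantile([0.25, 0.75]).compute()
iqr = q3 - q1
upper_fence = q3 + 1.5 * iqr
print('1.5 * IQR upper fence for Tips: {:.2f}'.format(upper_fence))
```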
outlier_time = dask.array.percentile(time_df['Trip Seconds'].dropna().values, q = 97)
outlier_tips = dask.array.percentile(time_df['Tips'].dropna().values, q = 97)
outlier_tips[0].compute()
filtered_time_df = time_df[(time_df['Trip Seconds'] <= outlier_time[0]) & (time_df['Tips'] <= outlier_tips[0])]
correlation_filtered = filtered_time_df.corr()
correlation_filtered = correlation_filtered.compute()
corr_coef = correlation_filtered['Tips']['Trip Seconds']
corr_coef
We can see that the correlation coefficient has improved to around 0.46, signifying a stronger positive relationship between Tips and trip duration.
Finally, we compute the correlation coefficient between Tips and time duration for those trips which actually gave a non-zero tip
filtered_time_df_nonzeros = filtered_time_df[filtered_time_df['Tips'] != 0]
correlation_filtered = filtered_time_df_nonzeros.corr()
correlation_filtered = correlation_filtered.compute()
corr_coef = correlation_filtered['Tips']['Trip Seconds']
corr_coef
THIS IS A HUGE IMPROVEMENT!
As we can see, the correlation coefficient implies that roughly 75% of the variance in Tips can be explained by the Trip Seconds data alone. Moreover, the higher degree of correlation means that, for those passengers who do tip (barring outliers), there is a stronger linear association between the tips received and the amount of time a passenger spends inside a taxi.
To further substantiate this finding, we visualize a linear regression line with a 95% confidence interval. Since there are around 25 million records, we select a random sample of 500 rides to show the general trend of the regression line.
thousand_random = filtered_time_df_nonzeros.sample(frac = 4.0156697860121856e-05)
thousand_random = thousand_random.compute()
sns.regplot(x = 'Trip Seconds', y = 'Tips', data = thousand_random)
plt.title('Regressing Tips vs Trip Seconds on a Sample from Taxi Rides')
plt.show()
The graph above shows that the confidence interval is fairly narrow, suggesting that the linear trend is in fact significant.
Seeing this, we now try to perform statistical validation of our strong linear relationship
Let us check if our Tips (y-values) are normally distributed
VISUAL TEST
y_values = filtered_time_df_nonzeros['Tips'].dropna().to_dask_array()
sns.distplot(y_values, kde = True)
plt.title('Distribution of Tips')
plt.xlabel('Tip')
plt.ylabel('KDE density')
plt.show()
From the histogram we can see a large peak centered at 2 USD, with smaller peaks at 1 USD and 3 USD. The distribution is heavily right-skewed, with smaller and smaller peaks at the other integer tip values; it is therefore unimodal with a heavy right skew. This suggests that the distribution is not normal, which is to be expected: intuitively, tip values should be right-skewed to account for those people who give an uncommonly high tip!
Note that the KDE density can be used as a proxy for frequency.
STATISTICAL TEST
We perform the following test with the null and alternative hypotheses below, using a significance level ($\alpha$) of 0.05:
$H_0:$ The distribution of tip values is normal
$H_A:$ The distribution of tip values is not normal
results = stats.normaltest(y_values)
results
The p-value clearly indicates that we can reject the null hypothesis. Hence we conclude that the distribution of tips is not normal.
However, our goal is to establish that there is a statistically significant linear relationship between tips and trip duration.
Hence, we use the simple fact that $\rho(x,y) = \rho(y,x)$, where $\rho$ is the correlation coefficient between two variables $x$ and $y$, namely tips and time spent in the cab.
Let us check if time duration (the previous x-variable) is normally distributed instead.
VISUAL TEST
y_values = filtered_time_df_nonzeros['Trip Seconds'].dropna().to_dask_array()
sns.distplot(y_values, kde = True)
plt.title('Distribution of Trip Times')
plt.xlabel('Trip Seconds')
plt.ylabel('KDE density')
plt.show()
From the histogram we can see that this distribution again has a heavy right skew while remaining unimodal. Visually, it does not appear to be normal.
Note that the KDE density can be used as a proxy for frequency.
STATISTICAL TEST
We perform the following test with the null and alternative hypotheses below, using a significance level ($\alpha$) of 0.05:
$H_0:$ The distribution of time spent in cabs is normal
$H_A:$ The distribution of time spent in cabs is not normal
results = stats.normaltest(y_values)
results
The p-value clearly indicates that we can reject the null hypothesis. Hence we conclude that the distribution of trip durations is not normal.
We perform the following test with the null and alternative hypotheses below, using a significance level ($\alpha$) of 0.05:
$H_0:$ The slope of the regression line, $b_1$,[1] is 0
$H_A:$ The slope of the regression line, $b_1$,[1] is not 0
[1] When both variables are standardized, the slope of the regression line equals the correlation coefficient ($\rho$) between the dependent and independent variables.
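To illustrate the footnote, here is a minimal, self-contained sketch on synthetic data (not the taxi dataset) showing that after z-scoring both variables, the fitted slope matches the Pearson correlation coefficient:

```python
# Illustrative sketch on synthetic data: with both variables z-scored,
# the least-squares slope equals the Pearson correlation coefficient.
rng = np.random.default_rng(0)
x = rng.normal(size=1000)
y = 2.0 * x + rng.normal(size=1000)   # a noisy linear relationship

zx = (x - x.mean()) / x.std()
zy = (y - y.mean()) / y.std()

slope_std, _, r, _, _ = stats.linregress(zx, zy)
print(slope_std, r)                   # the two values coincide
```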
filtered_time_df_nonzeros_dropna = filtered_time_df_nonzeros.dropna()
slope, intercept, r_value, p_value, std_err = stats.linregress(filtered_time_df_nonzeros_dropna['Trip Seconds'],filtered_time_df_nonzeros_dropna['Tips'])
print('correlation coefficient is: {}'.format(r_value), 'p-value is: {}'.format(p_value))
The p-value clearly indicates that we can reject the null hypothesis. Hence we conclude that there is a statistically significant, strong linear relationship between Tips and trip duration.
Since the normality assumptions underlying inference on the linear regression are not met, and since we have so much data, let us use the non-parametric Spearman rank correlation to estimate whether there is a monotonically increasing relationship between cab travel times and tips.
Assumptions for Spearman Rank Correlation: the observations are paired and each variable is measured on at least an ordinal (rankable) scale; normality is not required, which is why the test is suitable here.
STATISTICAL TEST
We perform the following test with the null and alternative hypotheses below, using a significance level ($\alpha$) of 0.05:
$H_0:$ There is no monotonic relationship between Trip Times and Tips
$H_A:$ There is a monotonic relationship between Trip Times and Tips
results = stats.spearmanr(filtered_time_df_nonzeros_dropna['Trip Seconds'],filtered_time_df_nonzeros_dropna['Tips'])
results
The moderately high positive value of the statistic suggests that the relationship is monotonically increasing: greater time spent in the cab results in greater tips received by drivers, for those passengers who do tip. The low p-value indicates that we can reject the null hypothesis, so this monotonically increasing relationship is statistically significant.
We select the Company column, which contains the name of the company each trip's driver belongs to.
company_df = df[['Tips', 'Company']]
We see that the number of trips for which the company is null is in fact zero
numnulls = company_df['Company'].isna().sum().compute()
numnulls
unique_companies = company_df['Company'].unique().compute()
We see that there are 86 companies!
len(unique_companies)
Let us select only the "major-players" in Chicago
To do this we list the proportion of rides with non-null tips for each company
company_df_nonnulltips = company_df.dropna()
proportion_of_trips = (company_df_nonnulltips['Company'].value_counts())/len(company_df_nonnulltips['Company'])
proportion_of_trips = proportion_of_trips.compute()
proportion_of_trips.hist()
plt.title('Histogram of the Proportion of Trips by Taxi Company')
plt.ylabel('Number of Companies')
plt.xlabel('Proportion of Trips')
plt.show()
We see that most (~70 out of 86) of the companies account for less than 2% of the rides in the city. Let us compute some descriptive statistics to help identify the major players.
proportion_of_trips.describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.98])
From the descriptive statistics, let us define a major player as any company whose share of taxi rides in the city of Chicago exceeds the 90th percentile.
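As a side note, the hardcoded threshold used below could equally be derived programmatically; a small sketch (not the code we actually ran) would be:

```python
# Derive the 90th-percentile share programmatically instead of hardcoding it
threshold_90 = proportion_of_trips.quantile(0.90)
major_players = proportion_of_trips[proportion_of_trips > threshold_90]
```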
ninety_percentile = proportion_of_trips[proportion_of_trips > 3.809464e-02]
ninety_percentile.plot(kind = 'bar')
plt.title('Top Taxi Companies in Chicago')
plt.xlabel('Company Name')
plt.ylabel('Ride Share')
plt.show()
Let us collect the tips for the trips offered by these companies that are non-null, non-zero, and fall within the 97th percentile of Tips, to eliminate the outliers.
top_companies_tips = company_df[(company_df['Tips'] <= outlier_tips[0]) & (company_df['Company'].isin(ninety_percentile.index.tolist())) & (company_df['Tips'] != 0)]
categories = dict(zip(list(ninety_percentile.index), [i+1 for i in range(len(ninety_percentile))]))
top_companies_tips['Company Category'] = top_companies_tips['Company'].apply(lambda c: categories[c], meta = int)
import pandas as pd
pd.DataFrame.from_dict(categories, orient = 'index', columns = ['Category Number'])
Let us plot a violin plot to see if there are any differences in the distribution of Tips across companies
plt.rcParams["figure.figsize"] = [16,9]
sns.violinplot(x="Company Category", y="Tips", data=top_companies_tips.compute())
plt.title('Distribution of Tips by Company')
plt.savefig('CATPLOT.png', dpi = 400)
We see that the distributions of tips are extremely similar across companies, with peaks corresponding to USD 1, USD 2, and USD 3. However, companies 2, 5, 8, and 9 show slight additional peaks at USD 4 or more.
The boxplots inside these violins, however, tell a different story: the median tips across the various companies are clearly not all the same. Chicago Carriage Cab Corp, Taxi Affiliation Service Yellow, and Medallion Leasin have a median tip close to USD 3, whereas Flash Cab, Taxi Affiliation Services, and Yellow Cab have a median closer to USD 2. Lastly, City Service, Sun Taxi, and Blue Ribbon Taxi Association Inc. have a median tip lower than USD 2.
Since none of these distributions is normal, we use a non-parametric test (the Kruskal-Wallis test) to check whether tips received differ across companies. Since each company has more than 5 rides, the assumptions of this test are satisfied.
We perform the following test with the null and alternative hypotheses below, using a significance level ($\alpha$) of 0.05:
$H_0:$ The median tip received by drivers is the same across companies
$H_A:$ The median tip received by drivers is not the same across companies
# Collect the Tips series for each company category and run the Kruskal-Wallis test
samples = [top_companies_tips[top_companies_tips['Company Category'] == cat].Tips
           for cat in categories.values()]
stats.kruskal(*samples)
This suggests that the median tip across companies is in fact not the same, an observation that is also clearly visible in the boxplots in the figure above. The differences in tips received are most likely due to differences in business models across the companies; however, such causal explanations are merely speculative, and further analysis would be needed to move from association to causality.
In this section we are forced to use single-core pandas/GeoPandas, which is inherently slower than Dask. So, to keep the computation feasible, we work with a representative sample of the dataset. For the sampling strategy we use proportionate allocation based on a simple random sample, and to make our results more robust we repeat the analysis over 4 successive iterations. Finally, to address ethical considerations, we drop the coordinates themselves and work with census tracts, which are far less granular than exact coordinates. Further, with the dataset's description in mind, we drop all records for which no census tract information is available.
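As an aside, a minimal sketch of what proportionate allocation by census tract could look like on a pandas frame is shown below (a hypothetical helper; in practice a simple random sample of the whole frame is already proportionate in expectation):

```python
# Hypothetical sketch of proportionate allocation by stratum on a pandas frame.
def proportionate_sample(pdf, stratum_col, frac, seed=0):
    """Sample the same fraction from every stratum, preserving stratum proportions."""
    return (pdf.groupby(stratum_col, group_keys=False)
               .apply(lambda g: g.sample(frac=frac, random_state=seed)))
```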
df_geographic = df[['Tips', 'Dropoff Census Tract']]
df_geographic = df_geographic.dropna(subset = ['Dropoff Census Tract'])
print('There are {} records left'.format(len(df_geographic)))
So we see that we still have a large number of records (~17 million) left! Let us now filter out the tips that are outliers or 0 USD.
outlier_tips = dask.array.percentile(df['Tips'].dropna().values, q = 97)
filtered_geo_df = df_geographic[(df_geographic['Tips'] <= outlier_tips[0]) & (df_geographic['Tips'] != 0)]
print('There are {} records left'.format(len(filtered_geo_df)))
Even after this filtering, we still have a large number of records (~7 million) left for analysis. To simplify the analysis we choose a random subset of 12,000 taxi rides (an arbitrarily high threshold, beyond which our computers started taking too long). We recognize that using only a random subset affects the power of the conclusions we draw; however, since Dask does not natively support GeoPandas, we are left with little choice computationally speaking.
Using this random sample of ~12,000 taxi rides, we do an inner join with census tract information retrieved from the United States Census Bureau for the State of Illinois for 2017. This provides the polygons (geographical areas) that constitute each census tract.
df_geographic = filtered_geo_df[['Tips', 'Dropoff Census Tract']]
sample = df_geographic.sample(frac = 12000/7310603)
sample_df = sample.compute()
Reading the census tracts for the State of Illinois from the U.S. Census Bureau
shapefile_df = gpd.read_file('./shapefiles/cb_2017_17_tract_500k.shp')
shapefile_df['GEOID'] = shapefile_df['GEOID'].astype(float)
geo_df_merged = sample_df.merge(shapefile_df, how = 'inner', left_on='Dropoff Census Tract', right_on='GEOID')
Merging Entries and converting it to a Geopandas Geo Data Frame
geo_df_merged_shp = geo_df_merged[['Tips', 'GEOID', 'geometry']]
geo_df_merged_shp = gpd.GeoDataFrame(geo_df_merged_shp, crs=shapefile_df.crs, geometry='geometry')
Plotting everything out using Matplotlib
plt.rcParams["figure.figsize"] = [16,9]
ax = plt.gca()
ax.grid(color='#b8b8b8', linestyle='--', linewidth=2, zorder=0)
ax.set_facecolor("#f2f2f2")
geo_df_merged_shp.plot(column = 'Tips', cmap='OrRd', legend=True, ax = ax)
plt.title('Degree of Spatial Clustering of Taxi Tips across Chicago', fontsize=15)
plt.xlabel('Longitude', fontsize=12)
plt.ylabel('Latitude', fontsize=12)
plt.savefig('plotted.png', dpi = 300)
plt.close()
### We have manually added the annotation on the scale to make things a bit clearer without cluttering the figure.
Image('plotted.png')
We can see that this map does not look anything like the city of Chicago! However, we do see certain areas with clusters of high tips and others with low tips. This motivates the need to test for spatial autocorrelation.
Spatial Autocorrelation 1
Spatial autocorrelation is the term used to describe the presence of systematic spatial variation in a variable. Positive spatial autocorrelation is the tendency for sites or areas that are close to one another to have similar values of the variable (i.e., both high or both low). Negative spatial autocorrelation is the tendency for adjacent values to be dissimilar (high values next to low values). The presence of spatial autocorrelation in map data has often been taken as indicative that there is something of interest in the map that calls for further investigation in order to understand the reasons behind the observed spatial variation.
In order to test for overall spatial autocorrelation in this map, we use a statistic known as the Global Moran's I coefficient. Its interpretation is much akin to a correlation coefficient: Moran's I ranges between -1 and 1, where 1 indicates complete positive spatial autocorrelation, -1 indicates complete negative spatial autocorrelation, and 0 indicates spatial randomness. We compute Moran's I using the following formula:
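With $n$ regions, spatial weights $w_{ij}$, observed values $x_i$, and mean $\bar{x}$, the standard definition of the Global Moran's I is:

$$ I = \frac{n}{\sum_{i=1}^{n}\sum_{j=1}^{n} w_{ij}} \cdot \frac{\sum_{i=1}^{n}\sum_{j=1}^{n} w_{ij}\,(x_i - \bar{x})(x_j - \bar{x})}{\sum_{i=1}^{n} (x_i - \bar{x})^2} $$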
The spatial weights matrix $W$ is computed through an inverse-distance metric, $w_{ij} \propto 1/d_{ij}$, so that points near a chosen region in space are weighted more heavily than points farther away.
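For intuition only, here is a minimal NumPy sketch of this computation with inverse-distance weights (this is not the implementation we actually used, described next; `coords` is a hypothetical array of tract-centroid coordinates):

```python
# Illustrative sketch of Global Moran's I with inverse-distance weights.
# `values` are the tip values; `coords` is a hypothetical (n, 2) array of tract centroids.
def morans_i(values, coords):
    x = np.asarray(values, dtype=float)
    n = len(x)
    # Pairwise Euclidean distances between centroids
    d = np.linalg.norm(coords[:, None, :] - coords[None, :, :], axis=-1)
    with np.errstate(divide='ignore'):
        w = 1.0 / d                   # inverse-distance weights
    np.fill_diagonal(w, 0.0)          # no self-weight
    z = x - x.mean()
    return (n / w.sum()) * (w * np.outer(z, z)).sum() / (z ** 2).sum()
```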
We attempted to compute the Global Moran's I using the library PySAL; however, the latest version of the library failed to install on our computer. To work around this, we used a free trial of the geospatial software ArcGIS Pro, an industry-standard alternative that offers comprehensive spatial statistics through ArcPy, its in-house Python interface, in addition to ArcMap. The spatial analysis was still interfaced through GeoPandas: GeoPandas allowed us to create a shapefile from the dataset, which we could then load into ArcPy to write Python scripts performing the spatial analysis. We attach all of the Python code we wrote in the code cells of this notebook.
We found that the only way to transfer our data would be to convert our GeoDataFrame to a shapefile.
geo_df_merged_shp.to_file('tipsfil.shp')
Now we can work with Python within the ArcGIS Pro console. First we compute the global spatial autocorrelation present in the tips. Note that our shapefile is 'tipsfil'; the second argument specifies which column of the shapefile to test for autocorrelation (Tips); GENERATE_REPORT produces a visualization of the spatial autocorrelation statistic; INVERSE_DISTANCE is the method used to build the weights matrix (alternatives include Queen's contiguity, Rook's contiguity, etc.); and EUCLIDEAN_DISTANCE specifies how distances between features are measured.
Code
arcpy.stats.SpatialAutocorrelation("tipsfil", "Tips", "GENERATE_REPORT", "INVERSE_DISTANCE", "EUCLIDEAN_DISTANCE", "NONE", None, None)
From the report we can clearly see the presence of autocorrelation between dropoff location and taxi driver tips. The Global Moran's I statistic is weakly positive, which suggests that the spatial distribution of high and/or low tips in the dataset is more spatially clustered than would be expected if the underlying spatial processes were random, albeit weakly so. This indicates a dependence between dropoff location and the tips received, for drivers who do receive a tip for the ride.
Having quantified the location dependence of tips, let us find out whether any particular locations are hotspots for high tips!
We compute this using the Getis-Ord $G_i^*$ statistic. According to the ArcGIS Pro documentation, the HotSpots function in ArcMap allows one to detect statistically significant clusters of high tip values (hot spots) and low tip values (cold spots). The formula of the $G_i^*$ statistic is provided below.
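The standard form of the statistic, for feature $i$ with values $x_j$, spatial weights $w_{ij}$, and $n$ features, is:

$$ G_i^* = \frac{\displaystyle\sum_{j=1}^{n} w_{ij} x_j \;-\; \bar{X}\sum_{j=1}^{n} w_{ij}}{S\,\sqrt{\dfrac{n\displaystyle\sum_{j=1}^{n} w_{ij}^2 - \Bigl(\displaystyle\sum_{j=1}^{n} w_{ij}\Bigr)^2}{n-1}}},
\qquad \bar{X} = \frac{1}{n}\sum_{j=1}^{n} x_j,
\qquad S = \sqrt{\frac{1}{n}\sum_{j=1}^{n} x_j^2 - \bar{X}^2} $$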
Based on these values, hot spots are colored red and cold spots blue. Further, the intensity of the color varies with the level of statistical significance of the respective value.
Code
import arcpy
arcpy.stats.HotSpots("tipsfil", "Tips", r"C:\Users\gani1\OneDrive\Documents\ArcGIS\Projects\COGS108\COGS108.gdb\tipsfil_HotSpots", "FIXED_DISTANCE_BAND", "EUCLIDEAN_DISTANCE", "NONE", None, None, None, "NO_FDR")
Are Airports a Center for High Tips?
We chose the U.S. labelled basemap in ArcGIS Pro and used the to_featurelayer() function in ArcPy to overlay the results on it. Using this, and the code above, we can see that the entire city of Chicago is a cold spot for tips. We attribute this to the fact that taxi rides within the city limits do not cost as much, and hence do not warrant a large tip. However, taxi rides to the outskirts of the city typically end up as hot spots, which agrees with our conclusion from Part 1 that greater time spent in the cab results in greater tips. Finally, we see that both airports serving Chicago (MDW, ORD) are hotspots for tips! This is probably because the airports are quite a distance away from the city center. However, association does not mean causation, and further investigation is necessary to understand why airports in Chicago attract higher tips.
In the final section we investigate whether there are any significant differences in the taxi tips received during the holiday season (Thanksgiving/Christmas) compared to other parts of the year. Our hypothesis is that if people are more generous during the holidays, we would expect greater tips on a typical ride!
We first isolate the Trip Start Timestamp and Tips columns. As usual, we then retain all tips within the 97th percentile of tip values, and we also drop all tips that are nil (0 USD).
holiday_df = df[['Trip Start Timestamp', 'Tips']]
outlier_tips = dask.array.percentile(df['Tips'].dropna().values, q = 97)
filtered_holiday_df = holiday_df[(holiday_df['Tips'] <= outlier_tips[0]) & (holiday_df['Tips'] > 0)]
filtered_holiday_df = filtered_holiday_df.dropna()
tg_df = filtered_holiday_df.copy()
xmas_df = filtered_holiday_df.copy()
non_hol_df = filtered_holiday_df.copy()
We write user-defined functions to filter the dates around Christmas and Thanksgiving 2017. Thanksgiving was 11/23/2017, so we keep dates from 11/18/17 through 11/27/17 to account for people travelling to Chicago for Thanksgiving. Similarly, we keep dates from 12/20/17 to 12/31/17 for Christmas travel. Finally, we also write a function to select the dates that do not fall into either of these brackets (i.e. non-Christmas, non-Thanksgiving time frames). Note that there are plenty of holidays celebrated in the U.S., and it is our subjective choice to use the two biggest holiday seasons.
def date_check_tg(x):
x = x[:10]
if x[:2] != "11":
return np.nan
elif 18 <= int(x[3:5]) <= 27:
return x
else:
return np.nan
def non_tg_non_xmas(x):
    # Keep only dates outside both holiday windows (timestamps start with MM/DD/YYYY)
    x = x[:10]
    month, day = x[:2], int(x[3:5])
    if month == "11" and 18 <= day <= 27:   # Thanksgiving window
        return np.nan
    if month == "12" and 20 <= day <= 31:   # Christmas window
        return np.nan
    return x
def date_check_xmas(x):
x = x[:10]
if x[:2] != "12":
return np.nan
elif 20 <= int(x[3:5]) <= 31:
return x
else:
return np.nan
tg_df["Trip Start Timestamp"] = tg_df["Trip Start Timestamp"].apply(date_check_tg, meta=('Trip Start Timestamp', 'object'))
non_hol_df["Trip Start Timestamp"] = non_hol_df["Trip Start Timestamp"].apply(non_tg_non_xmas, meta=('Trip Start Timestamp', 'object'))
xmas_df["Trip Start Timestamp"] = xmas_df["Trip Start Timestamp"].apply(date_check_xmas, meta=('Trip Start Timestamp', 'object'))
We drop nulls, since that is how our user-defined functions mark non-conforming dates.
tg_df = tg_df.dropna()
non_hol_df = non_hol_df.dropna()
xmas_df = xmas_df.dropna()
In order to draw a seaborn violin plot, we need a categorical variable, so we manually create a feature called season, representing which holiday season the date of the ride falls into. The variable season takes on 3 values: 'thanksgiving', 'xmas', and 'non-holiday'.
We then concatenate these datasets to create two larger datasets, all_data_with_season and all_data_with_thanksgiving, each of which contains the categorical variable season!
xmas_df['season'] = 'xmas'
non_hol_df['season'] = 'non-holiday'
tg_df['season'] = 'thanksgiving'
all_data_with_season = dd.concat([xmas_df, non_hol_df])
all_data_with_thanksgiving = dd.concat([tg_df, non_hol_df])
Note that we could easily have concatenated xmas_df, non_hol_df, and tg_df into a single dataframe, but we choose not to, because in the following cells we perform the Kruskal-Wallis test separately for each holiday against the non-holiday period. The Kruskal-Wallis test is essentially a non-parametric version of ANOVA. In short, it tests the following hypotheses:
$H_0:$ The distribution of tips during Thanksgiving and the non-holiday season is the same
$H_A:$ The distribution of tips received by drivers during Thanksgiving and the non-holiday season is not the same
stats.kruskal(tg_df.Tips, non_hol_df.Tips)
Using our earlier defined cut-off of $\alpha$ = 0.05, we fail to reject the null hypothesis. Hence, this statistical test suggests that there are no significant differences between Thanksgiving tips and non-holiday tips. This is also clearly reflected in the violin plot generated below!
sns.violinplot(x = 'season', y = 'Tips', data = all_data_with_thanksgiving.compute())
plt.savefig('thanksgiving.png')
plt.close()
Similarly, we do the same analysis for Christmas and the other periods of the year.
$H_0:$ The median tip during Christmas and the non-holiday season is the same
$H_A:$ The median tip received by drivers during Christmas and the non-holiday season is not the same
Using our earlier defined cut-off of $\alpha$ = 0.05, we can reject the null hypothesis. Hence, this statistical test suggests that there are significant differences between Christmas tips and non-holiday tips. While the overall shapes of the distributions look quite similar in the violin plots, the inner side-by-side boxplots show that the medians are indeed different. In particular, the 75th percentile of tips during Christmas is lower than during other periods of the year, which in turn pulls the median down.
stats.kruskal(xmas_df.Tips, non_hol_df.Tips)
sns.violinplot(x = 'season', y = 'Tips', data = all_data_with_season.compute())
plt.savefig('xmas.png')
plt.close()
According to the City of Chicago's data collection team, personal privacy was prioritized when developing the data set, through specially developed de-identification and aggregation techniques. These include aggregation by time, in which all trips were rounded to the nearest 15-minute interval. The data collection team also exploits the sheer size of the city of Chicago, which is spread over 800 census tracts ranging between 89,000 sq. ft and 84 million sq. ft. As a result of this, coupled with the time-based de-identification, it is impossible to know the precise time and place a trip occurred beyond a 15-minute window and an 89,000-square-foot area. Since the dataset does provide the approximate location of a trip, another layer of protection was added to avoid linking individuals' trip location data to their identities. These techniques and explanations are obtained directly from the City of Chicago's data privacy site referenced below:
Source: City of Chicago: Data Privacy
In addition to the above facts, we performed a quick analysis of the columns of the dataset we obtained. We saw that no information about the rider is published; the only columns related to the rider are the fare, the locations, and the start/stop times of the trip. No data such as name, address, phone number, email, or any other information that might invade the rider's privacy is collected. The columns we chose to analyse, namely Tips, Company, and Trip Start Time, relate only to the taxi rides themselves and adhere to the safe harbor methods discussed in class.
Our efforts to ensure Privacy and Ethics in Analysis
Despite the City of Chicago's efforts in anonymizing the data, securing personal privacy, and aggregating by geographical location and time, there is still an obvious concern about the exact pick-up and drop-off locations (latitude, longitude) provided. While these cannot be linked directly to a specific individual, patterns of individuals could readily be found using this data set, which is an invasion of privacy. For example, a frequent drop-off location could be extrapolated to a specific individual, a clear invasion of privacy. This also violates the safe harbor "geographical subdivisions" method, indicating that such data should not be published publicly. In addition, trip and taxi IDs are disclosed in the data set; while these are anonymized, each still relates to a unique individual, which could be an invasion of privacy.
We handled this issue by ignoring the latitude and longitude coordinates (dropping them from the data set), and instead focused our geographical analysis on the census tracts disclosed by the data set. These census tracts are far more general than the coordinates provided, since they are by definition aggregations rather than specific locations and are simply abstract regions created by the City of Chicago.
The drop-off census tracts which did not guarantee anonymity (indicated by NaNs in the dataset), were dropped from the geospatial portion of our analysis. Thus, there is no zip-code or specific coordinates involved in the analysis, just a general region (akin to a neighbourhood) associated with the drop-off location.
Similarly, in an effort to respect privacy and ethical norms, we simply chose to drop the taxi ID and trip ID columns from our analysis, following the insightful discussion in class where one could potentially "hack" the anonymization (hashing) process.
Other than this, the data set is open for public use, so there are no issues with us utilizing it for our project. There should not be any bias within the data set either, as it is collected by an agency unrelated to the cab services; the files exist simply for documentation purposes and for analyses like this one.