-
Notifications
You must be signed in to change notification settings - Fork 2
/
pearson.py
46 lines (35 loc) · 1.67 KB
/
pearson.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import pandas as pd
from cleaning import clean_data
def pearson_coeff(one_week: pd.Series, season_model: pd.Series) -> float:
    '''
    Computes the Pearson correlation coefficient between binned observations for one week of data
    and corresponding seasonal data

    ## Parameters
    * `one_week`: One week of data which has been aggregated into a 24h representation or similar
    * `season_model`: Seasonal representation of the data with the same binning strategy as `one_week`

    ## Returns
    The Pearson correlation coefficient between the week and seasonal observations.
    Bins are paired by index label, because pandas aligns the two Series when they are multiplied.
    '''
    # Population covariance: the mean of the products divides by n.
    week_dev = one_week - one_week.mean()
    season_dev = season_model - season_model.mean()
    covariance = (week_dev * season_dev).mean()

    # `Series.std` defaults to ddof=1 (divides by n - 1). Mixing that with the
    # n-normalised covariance above would shrink the result by (n - 1) / n, so
    # the population standard deviation (ddof=0) is requested explicitly.
    spread = one_week.std(ddof=0) * season_model.std(ddof=0)

    return covariance / spread
def make_seasonal_models(df: pd.DataFrame) -> pd.Series:
    '''
    Creates seasonal median based models for use in the Pearson coefficient calculations.

    ## Parameters
    * `df`: Cleaned DataFrame with a datetime index, a `season` label column and a `count` column
      of bike share counts

    ## Returns
    A Multi-Indexed (season, time of day) Series named `count` holding the median count for each
    time-of-day bin within each season (24 bins per season when the data is hourly)
    '''
    # Bin on the clock time of each timestamp so that the same hour on
    # different days falls into the same bin.
    time_of_day = df.index.time

    # Select `count` before aggregating: only that column is returned, and
    # taking the median of the whole frame would also try to reduce any other
    # (possibly non-numeric) columns present in `df`.
    return df.groupby(['season', time_of_day])['count'].median()
if __name__ == '__main__':
    # Load the cleaned data set and build the per-season median profiles.
    df = clean_data('app/data/clean_data.csv')
    season_models = make_seasonal_models(df)

    # Split the observations into calendar weeks.
    weeks = df[['count', 'season']].groupby(pd.Grouper(freq='1W'))

    # NOTE(review): `results` is collected but not printed or saved anywhere
    # in the visible script - confirm whether output is intended.
    results = []
    for _, week in weeks:
        # A time-based Grouper can produce bins with no rows (gaps in the
        # data); `idxmax` raises on an empty result, so weeks without any
        # season label are skipped.
        season_counts = week['season'].value_counts()
        if season_counts.empty:
            continue

        # A week may straddle two seasons; use the one most of its rows carry.
        season = season_counts.idxmax()
        season_model = season_models[season]

        # Collapse the week into the same time-of-day bins as the season model.
        week_model = week['count'].groupby(week.index.time).median()
        results.append(pearson_coeff(week_model, season_model))