"""Plot absolute user retention per fixed-length period from an event-log CSV.

The CSV must contain a ``created_at`` column. After that column is moved to
the front, the next two columns are read positionally: the first is used as
the key identifying a user and the second is compared against the literal
``"withdraw"``.
NOTE(review): presumably these are a user id and an event type -- confirm
against the actual CSV schema.

For every user whose first observed event falls inside
``[user_start_date, user_end_date]``, each non-withdraw event that happens no
later than ``lookforward_window`` after that first event marks the user as
active in period ``days_since_first_event // retention_frequency``. The plot
shows the number of distinct active users per period.
"""

from collections import defaultdict
import datetime as dt
from datetime import datetime

import numpy as np
import pandas as pd

path_to_data = ""  # specify your path here

# Only users whose first event lies inside this window are analysed.
user_start_date = datetime.strptime("2021-10-01 16:51:08", "%Y-%m-%d %H:%M:%S")
user_end_date = datetime.strptime("2022-06-27 16:51:08", "%Y-%m-%d %H:%M:%S")

# Events later than this after a user's first event are ignored
# (28 weeks, roughly half a year).
lookforward_window = dt.timedelta(weeks=28)

retention_frequency = 28  # length of one retention period, in days

# The y axis always spans at least this many users, so charts of different
# cohorts stay visually comparable.
_MIN_Y_AXIS_TOP = 8500


def _naive(moment):
    """Return *moment* with tzinfo dropped (wall-clock kept, no conversion)."""
    return moment.replace(tzinfo=None)


def compute_retention(events, start, end, window, frequency_days):
    """Count distinct active users per retention period.

    Args:
        events: Iterable of rows indexable as ``row[0]`` (event timestamp),
            ``row[1]`` (hashable user key) and ``row[2]`` (value compared to
            ``"withdraw"``). Rows must be ordered by timestamp ascending:
            the first row seen for a user defines that user's first action.
        start: Earliest first-action time for a user to be included.
        end: Latest first-action time for a user to be included.
        window: ``timedelta``; events further than this after the user's
            first action are skipped.
        frequency_days: Length of one retention period in days.

    Returns:
        A tuple ``(retention_abs, withdrawal_cnt)``. ``retention_abs`` is an
        integer array whose element ``p`` is the number of distinct users
        with a counted event in period ``p``; it ends at the last non-empty
        period and is empty when no event qualifies. ``withdrawal_cnt`` is
        the number of in-window ``"withdraw"`` events, which are never
        counted as activity.
    """
    first_seen = {}
    users_by_period = defaultdict(set)
    withdrawal_cnt = 0
    # The bounds are loop-invariant, so strip their tzinfo once.
    start_naive = _naive(start)
    end_naive = _naive(end)

    for event in events:
        current_action_time, user, action = event[0], event[1], event[2]
        # Any event type, including a withdrawal, can be a user's first action.
        first_action_time = first_seen.setdefault(user, current_action_time)
        first_naive = _naive(first_action_time)

        # User's first action is outside the analysed window.
        if first_naive < start_naive or first_naive > end_naive:
            continue
        # Event is further than the look-forward window ahead.
        if _naive(current_action_time) > first_naive + window:
            continue
        if action == "withdraw":
            withdrawal_cnt += 1
            continue

        elapsed = pd.Timestamp(current_action_time) - pd.Timestamp(first_action_time)
        users_by_period[elapsed.days // frequency_days].add(user)

    if not users_by_period:
        # Nothing qualified: return an empty series instead of failing.
        return np.array([], dtype=int), withdrawal_cnt

    last_period = max(users_by_period)
    counts = [len(users_by_period.get(p, ())) for p in range(last_period + 1)]
    return np.array(counts, dtype=int), withdrawal_cnt


def plot_retention(retention_abs, start, end, frequency_days):
    """Show a line chart of users per retention period.

    Args:
        retention_abs: Sequence of user counts, one per period.
        start: Start of the first-action window (shown in the title).
        end: End of the first-action window (shown in the title).
        frequency_days: Period length in days; scales the x axis.
    """
    # Imported here so the computation above can be imported and tested
    # on machines without matplotlib or a display.
    import matplotlib.pyplot as plt

    xticks = np.arange(len(retention_abs)).astype(int) * frequency_days
    # Grow the axis only when the data would otherwise be clipped.
    peak = max(retention_abs, default=0)
    y_top = _MIN_Y_AXIS_TOP if peak <= _MIN_Y_AXIS_TOP else peak * 1.05

    plt.figure(figsize=(12, 6))
    plt.xlabel("# of days, retention period is {} days".format(frequency_days))
    plt.ylabel("# of users")
    plt.grid(True)
    plt.ylim([0, y_top])
    plt.title(
        "RETENTION\nstart_user_first_date={}\nend_user_first_date={}".format(
            start, end.replace(tzinfo=None)
        )
    )
    plt.plot(xticks, retention_abs)
    plt.show()


def main():
    """Load the event log, compute retention and display the chart."""
    frame = pd.read_csv(path_to_data)
    frame["created_at"] = pd.to_datetime(frame["created_at"])
    # Chronological order is required: the first row per user is treated as
    # that user's first action.
    frame = frame.set_index("created_at").sort_index()

    # reset_index puts created_at at position 0, followed by the remaining
    # CSV columns in their original order.
    events = frame.reset_index().values
    retention_abs, _withdrawal_cnt = compute_retention(
        events,
        user_start_date,
        user_end_date,
        lookforward_window,
        retention_frequency,
    )
    plot_retention(
        retention_abs, user_start_date, user_end_date, retention_frequency
    )


if __name__ == "__main__":
    main()