How to generate sample data for Automated Feature Engineering via Featuretools post

Introduction: Data Creation

In this notebook we create an example dataset to be used for automated feature engineering. I have included this code in the repository for posterity and because it may come in handy for generating additional example datasets.

In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
import random

# Seed both random number generators used by the cells in this notebook so
# that a fresh kernel run regenerates exactly the same dataset. Change SEED to
# produce a different (but still reproducible) dataset.
SEED = 50
random.seed(SEED)
np.random.seed(SEED)

# Pool of 1000 random calendar dates from 2000 through 2014. The day is capped
# at 28 so that every (year, month, day) combination is a valid date.
rand_dates = [
    datetime(random.choice(range(2000, 2015)),
             random.choice(range(1, 13)),
             random.choice(range(1, 29)))
    for _ in range(1000)
]
In [2]:
# Build 25 synthetic clients. Records are collected in a list and turned into
# a DataFrame once: `DataFrame.append` was removed in pandas 2.0 and copied
# the whole frame on every call.
# NOTE: client_id values are drawn independently, so two clients can collide;
# duplicates are dropped before the tables are written to disk.
client_records = []
for _ in range(25):
    client_records.append({
        'client_id': np.random.randint(25000, 50000),
        'joined': random.choice(rand_dates),
        'income': np.random.randint(30500, 240000),
        'credit_score': np.random.randint(500, 850),
    })

clients = pd.DataFrame(client_records,
                       columns=['client_id', 'joined', 'income', 'credit_score'])

clients.head()
Out[2]:
client_id joined income credit_score
0 46109 2002-04-16 172677 527
1 49545 2007-11-14 104564 770
2 41480 2013-03-11 122607 585
3 46180 2001-11-06 43851 562
4 25707 2006-10-06 211422 621
In [3]:
# Build 20 synthetic loans for every distinct client.
#
# Changes relative to the earlier row-by-row version:
#   * rows are collected in a list and the frame is built once
#     (`DataFrame.append` was removed in pandas 2.0);
#   * `pd.Timestamp` replaces the removed `pd.datetime` alias;
#   * the start day is capped at 28, so February in a non-leap year can no
#     longer raise ValueError;
#   * loan ids are drawn WITHOUT replacement, so every loan has a distinct id
#     and none is discarded by the later de-duplication step.
LOANS_PER_CLIENT = 20
client_ids = clients['client_id'].unique()

# Raises ValueError if more loans are requested than ids exist in the range.
loan_ids = iter(np.random.choice(np.arange(10000, 12000),
                                 size=len(client_ids) * LOANS_PER_CLIENT,
                                 replace=False))

loan_records = []
for client in client_ids:
    for _ in range(LOANS_PER_CLIENT):
        time_created = pd.Timestamp(year=int(np.random.randint(2000, 2015)),
                                    month=int(np.random.randint(1, 13)),
                                    day=int(np.random.randint(1, 29)))

        # Each loan runs for between 500 and 999 days.
        time_ended = time_created + pd.Timedelta(days=int(np.random.randint(500, 1000)))

        loan_records.append({
            'client_id': client,
            'loan_type': random.choice(['cash', 'credit', 'home', 'other']),
            'loan_amount': np.random.randint(500, 15000),
            'repaid': random.choice([0, 1]),
            'loan_id': int(next(loan_ids)),
            'loan_start': time_created,
            'loan_end': time_ended,
            # Absolute value of a scaled standard-normal draw, two decimals.
            'rate': round(abs(4 * np.random.randn()), 2),
        })

loans = pd.DataFrame(loan_records,
                     columns=['client_id', 'loan_type', 'loan_amount', 'repaid',
                              'loan_id', 'loan_start', 'loan_end', 'rate'])
In [4]:
# Preview the first rows of the generated loans table.
loans.head()
Out[4]:
client_id loan_type loan_amount repaid loan_id loan_start loan_end rate
0 46109 home 13672 0 10243 2002-04-16 2003-12-20 2.15
1 46109 credit 9794 0 10984 2003-10-21 2005-07-17 1.25
2 46109 home 12734 1 10990 2006-02-01 2007-07-05 0.68
3 46109 cash 12518 1 10596 2010-12-08 2013-05-05 1.24
4 46109 credit 14049 1 11415 2010-07-07 2012-05-21 3.13
In [5]:
# Build between 5 and 9 synthetic payments for every loan.
#
# Changes relative to the earlier row-by-row version:
#   * rows are collected in a list and the frame is built once
#     (`DataFrame.append` was removed in pandas 2.0);
#   * the unused `payment_id` counter is gone;
#   * payments are generated only for the first loan carrying each loan_id,
#     i.e. exactly the loans that survive `drop_duplicates(subset='loan_id')`
#     before export, so payments of different loans never share one id.
unique_loans = loans.drop_duplicates(subset='loan_id')

payment_records = []
for loan in unique_loans[['loan_id', 'loan_start', 'loan_amount']].itertuples(index=False):
    # The first payment falls at least 30 days after the loan starts.
    payment_date = loan.loan_start + pd.Timedelta(days=30)

    for _ in range(np.random.randint(5, 10)):
        # Successive payments are spaced 10 to 49 days apart.
        payment_date += pd.Timedelta(days=int(np.random.randint(10, 50)))
        payment_records.append({
            'loan_id': loan.loan_id,
            # Each payment is between a tenth and a fifth of the loan amount.
            'payment_amount': np.random.randint(int(loan.loan_amount / 10),
                                                int(loan.loan_amount / 5)),
            'payment_date': payment_date,
            'missed': random.choice([0, 1]),
        })

payments = pd.DataFrame(payment_records,
                        columns=['loan_id', 'payment_amount',
                                 'payment_date', 'missed'])
    
In [6]:
# Preview the first rows of the generated payments table.
payments.head()
Out[6]:
loan_id payment_amount payment_date missed
0 10243 2369 2002-05-31 1
1 10243 2439 2002-06-18 1
2 10243 2662 2002-06-29 0
3 10243 2268 2002-07-20 0
4 10243 2027 2002-07-31 1
In [7]:
# Keep only the first row for each key so client_id and loan_id are unique.
clients = clients.drop_duplicates(subset='client_id')
loans = loans.drop_duplicates(subset='loan_id')

# Write the three tables to CSV files in the working directory, without the
# pandas index column.
for csv_name, table in (('clients.csv', clients),
                        ('loans.csv', loans),
                        ('payments.csv', payments)):
    table.to_csv(csv_name, index=False)