Introduction: Data Creation¶

In this notebook we create an example dataset to be used for automated feature engineering. I have included this code in the repository for posterity and because it may come in handy at some point for generating additional example datasets.

In [1]:

import pandas as pd
import numpy as np

from datetime import datetime
import random

# Seed both random number generators used in this notebook (the stdlib
# `random` module and NumPy's global generator) so that Restart & Run All
# regenerates exactly the same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# 1000 random dates between 2000-01-01 and 2014-12-28. The day is capped at 28
# so that every (year, month, day) combination is a valid calendar date.
rand_dates = [
    datetime(random.randrange(2000, 2015),
             random.randrange(1, 13),
             random.randrange(1, 29))
    for _ in range(1000)
]

In [2]:

# Build all client records first and construct the DataFrame once.
# DataFrame.append was removed in pandas 2.0, and appending row by row
# re-copies the whole frame on every iteration.
# Note: client ids are drawn independently, so collisions are possible;
# duplicates are dropped before the tables are written to disk.
client_records = []
for _ in range(25):
    client_records.append({
        'client_id': np.random.randint(25000, 50000),
        'joined': random.choice(rand_dates),
        'income': np.random.randint(30500, 240000),
        'credit_score': np.random.randint(500, 850),
    })

clients = pd.DataFrame(client_records,
                       columns=['client_id', 'joined', 'income', 'credit_score'])

clients.head()

Out[2]:

	client_id	joined	income	credit_score
0	46109	2002-04-16	172677	527
1	49545	2007-11-14	104564	770
2	41480	2013-03-11	122607	585
3	46180	2001-11-06	43851	562
4	25707	2006-10-06	211422	621

In [3]:

# Generate 20 random loans for every client.
# Records are collected in a list and turned into a DataFrame once, because
# DataFrame.append was removed in pandas 2.0 and is quadratic when used per row.
loan_types = ['cash', 'credit', 'home', 'other']
loan_records = []

for client in clients['client_id'].unique():
    for _ in range(20):
        # `pd.datetime` no longer exists in pandas 2.0; use datetime.datetime.
        # The day is drawn from 1-28 (upper bound exclusive) so the date is
        # valid in every month; drawing up to 29 raised ValueError for
        # February in non-leap years.
        time_created = datetime(int(np.random.randint(2000, 2015)),
                                int(np.random.randint(1, 13)),
                                int(np.random.randint(1, 29)))

        # Each loan runs for 500-999 days.
        time_ended = time_created + pd.Timedelta(days=int(np.random.randint(500, 1000)))

        loan_records.append({
            'client_id': client,
            'loan_type': random.choice(loan_types),
            'loan_amount': np.random.randint(500, 15000),
            'repaid': random.choice([0, 1]),
            # Loan ids are drawn independently and may collide; duplicates
            # are dropped before the tables are written to disk.
            'loan_id': np.random.randint(10000, 12000),
            'loan_start': time_created,
            'loan_end': time_ended,
            # Absolute value of a scaled standard-normal draw, two decimals.
            'rate': round(abs(4 * np.random.randn()), 2),
        })

loans = pd.DataFrame(loan_records,
                     columns=['client_id', 'loan_type', 'loan_amount', 'repaid',
                              'loan_id', 'loan_start', 'loan_end', 'rate'])

In [4]:

# Preview the first rows of the generated loans table.
loans.head()

Out[4]:

	client_id	loan_type	loan_amount	repaid	loan_id	loan_start	loan_end	rate
0	46109	home	13672	0	10243	2002-04-16	2003-12-20	2.15
1	46109	credit	9794	0	10984	2003-10-21	2005-07-17	1.25
2	46109	home	12734	1	10990	2006-02-01	2007-07-05	0.68
3	46109	cash	12518	1	10596	2010-12-08	2013-05-05	1.24
4	46109	credit	14049	1	11415	2010-07-07	2012-05-21	3.13

In [5]:

# Generate between 5 and 9 payments for every loan.
# Records are collected in a list and turned into a DataFrame once, because
# DataFrame.append was removed in pandas 2.0 and is quadratic when used per row.
payment_records = []

# Loan ids can collide, and the loans table is later de-duplicated on
# `loan_id` (keeping the first occurrence). Iterating over the same
# de-duplicated view ensures payments are only created for loans that are
# actually saved, instead of being attributed to a different loan that
# happens to share the id.
unique_loans = loans.drop_duplicates(subset='loan_id')

for loan in unique_loans.itertuples(index=False):
    # Payments start after the loan begins: 30 days plus a random gap.
    payment_date = loan.loan_start + pd.Timedelta(days=30)

    for _ in range(np.random.randint(5, 10)):
        # Successive payments are 10-49 days apart.
        payment_date += pd.Timedelta(days=int(np.random.randint(10, 50)))
        payment_records.append({
            'loan_id': loan.loan_id,
            # Each payment is between 10% and 20% of the loan amount.
            'payment_amount': np.random.randint(int(loan.loan_amount / 10),
                                                int(loan.loan_amount / 5)),
            'payment_date': payment_date,
            'missed': random.choice([0, 1]),
        })

payments = pd.DataFrame(payment_records,
                        columns=['loan_id', 'payment_amount',
                                 'payment_date', 'missed'])

In [6]:

# Preview the first rows of the generated payments table.
payments.head()

Out[6]:

	loan_id	payment_amount	payment_date	missed
0	10243	2369	2002-05-31	1
1	10243	2439	2002-06-18	1
2	10243	2662	2002-06-29	0
3	10243	2268	2002-07-20	0
4	10243	2027	2002-07-31	1

In [7]:

# Identifiers were drawn at random, so keep only the first row for each
# client_id and each loan_id before persisting.
clients = clients.drop_duplicates(subset=['client_id'])
loans = loans.drop_duplicates(subset=['loan_id'])

# Write each table to a CSV file in the working directory, without the index.
for csv_name, table in (('clients.csv', clients),
                        ('loans.csv', loans),
                        ('payments.csv', payments)):
    table.to_csv(csv_name, index=False)

Introduction: Data Creation¶

Related Posts: