{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "1Bs1innK5QPb"
},
"source": [
"# Chapter 9 - Data Science"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#To change date format\n",
"#American date format is mm/dd/yyyy. I have changed it.\n",
"#I have made the negative campaign duration positive.\n",
"#And I have added the day of the week."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "5pNpI9xW5QPc"
},
"source": [
"## 0 - Setting up the notebook"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 406
},
"colab_type": "code",
"id": "Mp3qnAoS5QPe",
"outputId": "5ac5685b-b5ff-4635-dc
-7986b1e3d1cb"
},
"outputs": [],
"source": [
"#!pip install faker\n",
"#!pip install delorean\n",
"\n",
"import json\n",
"import calendar\n",
"import random\n",
"from datetime import date, timedelta, datetime\n",
"import faker\n",
"import numpy as np\n",
"from pandas import DataFrame\n",
"from delorean import parse\n",
"import pandas as pd\n",
"\n",
"# make the graphs nicer\n",
"import matplotlib.pyplot as plt\n",
"plt.style.use('ggplot')\n",
"# see all available with: print(plt.style.available)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "wegaBXNB5QPg"
},
"source": [
"## 1 - Preparing the Data"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "s_DxNSf55QPh"
},
"outputs": [],
"source": [
"# Seed both random sources used by this notebook (the stdlib random\n",
"# module and the Faker instance) so that re-running the notebook draws\n",
"# the same random sequence.\n",
"SEED = 42\n",
"random.seed(SEED)\n",
"\n",
"# create the faker to populate the data\n",
"fake = faker.Faker()\n",
"fake.seed_instance(SEED)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "irfx4AJr5QPj"
},
"outputs": [],
"source": [
"# how many distinct usernames we want\n",
"usernames_no = 1000\n",
"usernames = set()\n",
"\n",
"# a set drops duplicates, so keep drawing fake usernames until it\n",
"# holds usernames_no of them\n",
"while len(usernames) < usernames_no:\n",
"    usernames.add(fake.user_name())"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 87
},
"colab_type": "code",
"id": "_dtbclPG5QPm",
"outputId": "6315de4b-14a2-4927-ae17-b6db51390ff9"
},
"outputs": [
{
"data": {
"text/plain": [
"['{\"username\": \"ashley97\", \"name\": \"Miranda Co
\", \"gender\": \"F\", \"email\": \"
[email protected]\", \"age\": 35, \"address\": \"00037 Bowers Mall Apt. 791\\\\nHuangfurt, CA 81423\"}',\n",
" '{\"username\": \"lstewart\", \"name\": \"Julie Ha
is\", \"gender\": \"F\", \"email\": \"
[email protected]\", \"age\": 83, \"address\": \"83858 Jones Streets Suite 212\\\\nSarahburgh, SD 84019\"}',\n",
" '{\"username\": \"sharon85\", \"name\": \"Christopher Blevins\", \"gender\": \"M\", \"email\": \"
[email protected]\", \"age\": 62, \"address\": \"1266 Alice Pike\\\\nNew Jay, MI 30538\"}']"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_random_name_and_gender():\n",
"    \"\"\"Return a (full name, gender code) pair for a fake user.\n",
"\n",
"    The gender code is 'M' or 'F'; the name is produced by the\n",
"    matching fake.name_male() / fake.name_female() call.\n",
"    \"\"\"\n",
"    skew = .6  # 60% of users will be female\n",
"    if random.random() > skew:\n",
"        return fake.name_male(), 'M'\n",
"    return fake.name_female(), 'F'\n",
"\n",
"# Simulate user data coming from an API: one complete profile per\n",
"# username, each of them serialized as a JSON string.\n",
"def get_users(usernames):\n",
"    \"\"\"Build a JSON-encoded user profile for every username.\n",
"\n",
"    Returns a list of JSON strings, one per username, in the order in\n",
"    which usernames is iterated.\n",
"    \"\"\"\n",
"    def _profile(username):\n",
"        # one fake profile, dumped to JSON\n",
"        name, gender = get_random_name_and_gender()\n",
"        return json.dumps({\n",
"            'username': username,\n",
"            'name': name,\n",
"            'gender': gender,\n",
"            'email': fake.email(),\n",
"            'age': fake.random_int(min=18, max=90),\n",
"            'address': fake.address(),\n",
"        })\n",
"\n",
"    return [_profile(username) for username in usernames]\n",
"\n",
"users = get_users(usernames)\n",
"users[:3]"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "3cEb9cLG5QPo"
},
"outputs": [],
"source": [
"# campaign name format:\n",
"# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency\n",
"def get_type():\n",
"    \"\"\"Pick one of the internal campaign type codes at random.\"\"\"\n",
"    # just some gibberish internal codes\n",
"    codes = ['AKX', 'BYU', 'GRZ', 'KTR']\n",
"    return random.choice(codes)\n",
"\n",
"def get_start_end_dates():\n",
"    \"\"\"Return random campaign (start, end) dates as YYYYMMDD strings.\n",
"\n",
"    The start lies within 365 days either side of today and the end\n",
"    follows it by 1 to 730 days.\n",
"    \"\"\"\n",
"    date_format = \"%Y%m%d\"\n",
"    duration = random.randint(1, 2 * 365)\n",
"    offset = random.randint(-365, 365)\n",
"    start = date.today() - timedelta(days=offset)\n",
"    end = start + timedelta(days=duration)\n",
"    return start.strftime(date_format), end.strftime(date_format)\n",
"\n",
"def get_age():\n",
"    \"\"\"Return a random target age bracket such as '30-45'.\n",
"\n",
"    Both the lower bound (drawn from 20..45) and the bracket width\n",
"    (drawn from 5..25) are rounded down to a multiple of 5.\n",
"    \"\"\"\n",
"    lower = 5 * (random.randint(20, 45) // 5)\n",
"    width = 5 * (random.randint(5, 25) // 5)\n",
"    return '{}-{}'.format(lower, lower + width)\n",
"\n",
"def get_gender():\n",
"    \"\"\"Pick a random target gender code: 'M', 'F' or 'B'.\"\"\"\n",
"    options = ('M', 'F', 'B')\n",
"    return random.choice(options)\n",
"\n",
"def get_currency():\n",
"    \"\"\"Pick a random campaign currency code: 'GBP', 'EUR' or 'USD'.\"\"\"\n",
"    return random.choice(('GBP', 'EUR', 'USD'))\n",
"\n",
"def get_campaign_name():\n",
"    \"\"\"Assemble a random campaign name.\n",
"\n",
"    The parts are joined with underscores in this order:\n",
"    InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency\n",
"    \"\"\"\n",
"    separator = '_'\n",
"    type_ = get_type()\n",
"    # start and end dates come back as a pair; join them up front\n",
"    start_end = separator.join(get_start_end_dates())\n",
"    age = get_age()\n",
"    gender = get_gender()\n",
"    currency = get_currency()\n",
"    return separator.join(\n",
"        (type_, start_end, age, gender, currency))"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ehc8mWDw5QPq"
},
"outputs": [],
"source": [
"# campaign data:\n",
"# name, budget, spent, clicks, impressions\n",
"def get_campaign_data():\n",
"    \"\"\"Generate one campaign record with random figures.\n",
"\n",
"    Returns a dict with the keys cmp_name, cmp_bgt, cmp_spent,\n",
"    cmp_clicks and cmp_impr.\n",
"    \"\"\"\n",
"    campaign = {'cmp_name': get_campaign_name()}\n",
"    # budget is drawn first: the amount spent is capped by it\n",
"    campaign['cmp_bgt'] = random.randint(10**3, 10**6)\n",
"    campaign['cmp_spent'] = random.randint(10**2, campaign['cmp_bgt'])\n",
"    # clicks: triangular distribution (low, high, mode)\n",
"    campaign['cmp_clicks'] = int(\n",
"        random.triangular(10**2, 10**5, 0.2 * 10**5))\n",
"    # impressions: gaussian with mean 0.5 * 10**6 and sigma 2\n",
"    campaign['cmp_impr'] = int(random.gauss(0.5 * 10**6, 2))\n",
"    return campaign"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "FvtOrJPw5QPs"
},
"outputs": [],
"source": [
"# assemble the logic to get the final version of the rough data\n",
"# data will be a list of dictionaries. Each dictionary will follow\n",
"# this structure:\n",
"# {'user': user_json, 'campaigns': [c1, c2, ...]}\n",
"# where user_json is the JSON string version of a user data dict\n",
"# and c1, c2, ... are campaign dicts as returned by\n",
"# get_campaign_data\n",
"\n",
"def get_data(users):\n",
"    \"\"\"Attach a random batch of campaigns to every user.\n",
"\n",
"    Returns a list with one {'user': ..., 'campaigns': [...]} dict per\n",
"    item of users, in the same order; each user gets 2 to 8 campaigns.\n",
"    \"\"\"\n",
"    def _random_campaigns():\n",
"        # the batch size is drawn before any campaign is generated\n",
"        count = random.randint(2, 8)\n",
"        return [get_campaign_data() for _ in range(count)]\n",
"\n",
"    return [{'user': user, 'campaigns': _random_campaigns()}\n",
"            for user in users]"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ux9VN0Q15QPu"
},
"source": [
"## 2 - Cleaning the data"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 994
},
"colab_type": "code",
"id": "7kxPpJCh5QPv",
"outputId": "d0e63299-c882-42ea-9b30-23a6e8b5a63e"
},
"outputs": [
{
"data": {
"text/plain": [
"[{'user': '{\"username\": \"ashley97\", \"name\": \"Miranda Co
\", \"gender\": \"F\", \"email\": \"
[email protected]\", \"age\": 35, \"address\": \"00037 Bowers Mall Apt. 791\\\\nHuangfurt, CA 81423\"}',\n",
" 'campaigns': [{'cmp_name': 'KTR_20191001_20210104_30-45_B_EUR',\n",
" 'cmp_bgt': 542731,\n",
" 'cmp_spent': 320061,\n",
" 'cmp_clicks': 56556,\n",
" 'cmp_impr': 499998},\n",
" {'cmp_name': 'KTR_20181119_20190516_40-60_B_EUR',\n",
" 'cmp_bgt': 563162,\n",
" 'cmp_spent': 294891,\n",
" 'cmp_clicks': 66268,\n",
" 'cmp_impr': 500001}]},\n",
" {'user': '{\"username\": \"lstewart\", \"name\": \"Julie Ha
is\", \"gender\": \"F\", \"email\": \"
[email protected]\", \"age\": 83, \"address\": \"83858 Jones Streets Suite 212\\\\nSarahburgh, SD 84019\"}',\n",
" 'campaigns': [{'cmp_name': 'KTR_20190603_20200807_35-45_M_EUR',\n",
" 'cmp_bgt': 354212,\n",
" 'cmp_spent': 30657,\n",
" 'cmp_clicks': 29316,\n",
" 'cmp_impr': 500001},\n",
" {'cmp_name': 'GRZ_20191119_20210913_40-55_B_GBP',\n",
" 'cmp_bgt': 994364,\n",
" 'cmp_spent': 122783,\n",
" 'cmp_clicks': 63457,\n",
" 'cmp_impr': 499999}]}]"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# fetch simulated rough data: get_data returns one\n",
"# {'user': ..., 'campaigns': [...]} dict per item of users\n",
"rough_data = get_data(users)\n",
"\n",
"rough_data[:2] # let's take a peek"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 238
},
"colab_type": "code",
"id": "Bf6rqL-d5QPx",
"outputId": "9c0710be-c3e8-4f5b-c8c9-1df0508396ed"
},
"outputs": [
{
"data": {
"text/plain": [
"[{'cmp_name': 'KTR_20191001_20210104_30-45_B_EUR',\n",
" 'cmp_bgt': 542731,\n",
" 'cmp_spent': 320061,\n",
" 'cmp_clicks': 56556,\n",
" 'cmp_impr': 499998,\n",
" 'user': '{\"username\": \"ashley97\", \"name\": \"Miranda Co
\", \"gender\": \"F\", \"email\": \"
[email protected]\", \"age\": 35, \"address\": \"00037 Bowers Mall Apt. 791\\\\nHuangfurt, CA 81423\"}'},\n",
" {'cmp_name': 'KTR_20181119_20190516_40-60_B_EUR',\n",
" 'cmp_bgt': 563162,\n",
" 'cmp_spent': 294891,\n",
" 'cmp_clicks': 66268,\n",
" 'cmp_impr': 500001,\n",
" 'user': '{\"username\": \"ashley97\", \"name\": \"Miranda Co
\", \"gender\": \"F\", \"email\": \"
[email protected]\", \"age\": 35, \"address\": \"00037 Bowers Mall Apt. 791\\\\nHuangfurt, CA 81423\"}'}]"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Let's start from having a different version of the data\n",
"# I want a list whose items will be dicts. Each dict is\n",
"# the original campaign dict plus the user JSON.\n",
"#\n",
"# A new dict is built for every campaign instead of calling update()\n",
"# on the dicts stored inside rough_data, so rough_data itself is left\n",
"# exactly as it was displayed earlier.\n",
"data = [\n",
"    dict(campaign, user=datum['user'])\n",
"    for datum in rough_data\n",
"    for campaign in datum['campaigns']\n",
"]\n",
"data[:2] # let's take another peek"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "25HN96S65QP0"
},
"source": [
"### Creating the DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
},
"colab_type": "code",
"id": "g-te8Uy85QP1",
"outputId": "82764019-7e45-4384-e30a-136e3468b786"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"