Codecademy - Pandas Lesson - Teddy's online desktop

You’re getting ready to staff the clinic for March this year. You want to know how many visits took place in March last year, to help you prepare.

Write a command that will produce a Series made up of the March data from df from all four clinic sites and save it to the variable march.

#import     /;../,codecademylib3

import pandas as pd

df = pd.DataFrame([
  ['January', 100, 100, 23, 100],
  ['February', 51, 45, 145, 45],
  ['March', 81, 96, 65, 96],
  ['April', 80, 80, 54, 180],
  ['May', 51, 54, 54, 154],
  ['June', 112, 109, 79, 129]],
  columns=['month', 'clinic_east',
           'clinic_north', 'clinic_south',
           'clinic_west'])
           
print(df)

##       month  clinic_east  clinic_north  clinic_south  clinic_west
## 0   January          100           100            23          100
## 1  February           51            45           145           45
## 2     March           81            96            65           96
## 3     April           80            80            54          180
## 4       May           51            54            54          154
## 5      June          112           109            79          129

# integer location within dataframe 
# locations are zero indexed and doesn't include the ending integer

march = df.iloc[2]
print(march)

## month           March
## clinic_east        81
## clinic_north       96
## clinic_south       65
## clinic_west        96
## Name: 2, dtype: object

april_may_june = df.iloc[3:6]
print(april_may_june)

##    month  clinic_east  clinic_north  clinic_south  clinic_west
## 3  April           80            80            54          180
## 4    May           51            54            54          154
## 5   June          112           109            79          129

You can select a subset of the a DataFrame by using logical statements and isin method.

january = df[df.month == 'January']
print(january)

##      month  clinic_east  clinic_north  clinic_south  clinic_west
## 0  January          100           100            23          100

march_april = df[(df.month == 'March') | (df.month == 'April')]

print(march_april)

##    month  clinic_east  clinic_north  clinic_south  clinic_west
## 2  March           81            96            65           96
## 3  April           80            80            54          180

january_february_march = df[df.month.isin(['January', 'February', 'March'])]

print(january_february_march)

##       month  clinic_east  clinic_north  clinic_south  clinic_west
## 0   January          100           100            23          100
## 1  February           51            45           145           45
## 2     March           81            96            65           96

Reset indices

df2 = df.loc[[1, 3, 5]]

print(df2)

##       month  clinic_east  clinic_north  clinic_south  clinic_west
## 1  February           51            45           145           45
## 3     April           80            80            54          180
## 5      June          112           109            79          129

df3 = df2.reset_index()

print(df3)

##    index     month  clinic_east  clinic_north  clinic_south  clinic_west
## 0      1  February           51            45           145           45
## 1      3     April           80            80            54          180
## 2      5      June          112           109            79          129

df2.reset_index(inplace = True, drop = True)

print(df2)

##       month  clinic_east  clinic_north  clinic_south  clinic_west
## 0  February           51            45           145           45
## 1     April           80            80            54          180
## 2      June          112           109            79          129

Adding a column to the dataframe

df = pd.DataFrame([
  [1, '3 inch screw', 0.5, 0.75],
  [2, '2 inch nail', 0.10, 0.25],
  [3, 'hammer', 3.00, 5.50],
  [4, 'screwdriver', 2.50, 3.00]
],
  columns=['Product ID', 'Description', 'Cost to Manufacture', 'Price']
)

df['Sold in Bulk?'] = ['Yes','Yes','No','No']

df['In Stock?'] = True

df['Margin'] = df.Price - df['Cost to Manufacture']

print(df)

##    Product ID   Description  ...  In Stock?  Margin
## 0           1  3 inch screw  ...       True    0.25
## 1           2   2 inch nail  ...       True    0.15
## 2           3        hammer  ...       True    2.50
## 3           4   screwdriver  ...       True    0.50
## 
## [4 rows x 7 columns]

Applying functions to dataframe

df = pd.DataFrame([
  ['JOHN SMITH', 'john.smith@gmail.com'],
  ['Jane Doe', 'jdoe@yahoo.com'],
  ['joe schmo', 'joeschmo@hotmail.com']
],
columns=['Name', 'Email'])

df['Lowercase Name'] = df.Name.apply(str.lower)

print(df)

#prints the python code for the dataframe

##          Name                 Email Lowercase Name
## 0  JOHN SMITH  john.smith@gmail.com     john smith
## 1    Jane Doe        jdoe@yahoo.com       jane doe
## 2   joe schmo  joeschmo@hotmail.com      joe schmo

print(f'df = pd.DataFrame( {str(df.to_dict())} )')

## df = pd.DataFrame( {'Name': {0: 'JOHN SMITH', 1: 'Jane Doe', 2: 'joe schmo'}, 'Email': {0: 'john.smith@gmail.com', 1: 'jdoe@yahoo.com', 2: 'joeschmo@hotmail.com'}, 'Lowercase Name': {0: 'john smith', 1: 'jane doe', 2: 'joe schmo'}} )

Applying lambda functions

mylambda = lambda x: (x[0]+x[-1])

mylambda = lambda x: 'Welcome to BattleCity!' if x >= 13 else 'You must be over 13'

print(mylambda(14))

## Welcome to BattleCity!

df = pd.DataFrame( {'id': {0: 10310, 1: 18656, 2: 61254, 3: 16886, 4: 89010, 5: 87246, 6: 20578, 7: 12869, 8: 53461, 9: 14746, 10: 71127, 11: 92522, 12: 22447, 13: 61654, 14: 16988, 15: 68619, 16: 59949, 17: 81418, 18: 27267, 19: 19985}, 'name': {0: 'Lauren Durham', 1: 'Grace Sellers', 2: 'Shirley Rasmussen', 3: 'Brian Rojas', 4: 'Samantha Mosley', 5: 'Louis Guzman', 6: 'Denise Mcclure', 7: 'James Raymond', 8: 'Noah Collier', 9: 'Donna Frederick', 10: 'Shirley Beck', 11: 'Christina Kelly', 12: 'Brian Noble', 13: 'Randy Key', 14: 'Diana Stewart', 15: 'Timothy Sosa', 16: 'Betty Skinner', 17: 'Janet Maxwell', 18: 'Madison Johnston', 19: 'Virginia Nichols'}, 'hourly_wage': {0: 19, 1: 17, 2: 16, 3: 18, 4: 11, 5: 14, 6: 15, 7: 15, 8: 18, 9: 20, 10: 14, 11: 8, 12: 11, 13: 16, 14: 14, 15: 14, 16: 11, 17: 12, 18: 20, 19: 13}, 'hours_worked': {0: 43, 1: 40, 2: 30, 3: 47, 4: 38, 5: 39, 6: 40, 7: 32, 8: 35, 9: 41, 10: 32, 11: 44, 12: 39, 13: 38, 14: 48, 15: 42, 16: 48, 17: 38, 18: 37, 19: 49}, 'total_earned': {0: 845.5, 1: 680.0, 2: 480.0, 3: 909.0, 4: 418.0, 5: 546.0, 6: 600.0, 7: 480.0, 8: 630.0, 9: 830.0, 10: 448.0, 11: 368.0, 12: 429.0, 13: 608.0, 14: 728.0, 15: 602.0, 16: 572.0, 17: 456.0, 18: 740.0, 19: 695.5}} )


get_last_name = lambda x: x.split(' ')[-1]

df['last_name'] = df.name.apply(get_last_name)

total_earned = lambda row: (40 * row.hourly_wage) + ((row.hours_worked - 40) * 1.5 * row.hourly_wage) \
if row.hours_worked > 40 \
else (row.hours_worked * row.hourly_wage)

df['total_earned'] = df.apply(total_earned, axis =1)

print(df)

##        id               name  ...  total_earned  last_name
## 0   10310      Lauren Durham  ...         845.5     Durham
## 1   18656      Grace Sellers  ...         680.0    Sellers
## 2   61254  Shirley Rasmussen  ...         480.0  Rasmussen
## 3   16886        Brian Rojas  ...         909.0      Rojas
## 4   89010    Samantha Mosley  ...         418.0     Mosley
## 5   87246       Louis Guzman  ...         546.0     Guzman
## 6   20578     Denise Mcclure  ...         600.0    Mcclure
## 7   12869      James Raymond  ...         480.0    Raymond
## 8   53461       Noah Collier  ...         630.0    Collier
## 9   14746    Donna Frederick  ...         830.0  Frederick
## 10  71127       Shirley Beck  ...         448.0       Beck
## 11  92522    Christina Kelly  ...         368.0      Kelly
## 12  22447        Brian Noble  ...         429.0      Noble
## 13  61654          Randy Key  ...         608.0        Key
## 14  16988      Diana Stewart  ...         728.0    Stewart
## 15  68619       Timothy Sosa  ...         602.0       Sosa
## 16  59949      Betty Skinner  ...         572.0    Skinner
## 17  81418      Janet Maxwell  ...         456.0    Maxwell
## 18  27267   Madison Johnston  ...         740.0   Johnston
## 19  19985   Virginia Nichols  ...         695.5    Nichols
## 
## [20 rows x 6 columns]

Renaming columns

df = pd.DataFrame({
    'name': ['John', 'Jane', 'Sue', 'Fred'],
    'age': [23, 29, 21, 18]
})
df.columns = ['First Name', 'Age']

#or this method if you only want to rename one or more columns

df.rename(columns={
    'name': 'First Name',
    'age': 'Age'},
    inplace=True)

print(df)

##   First Name  Age
## 0       John   23
## 1       Jane   29
## 2        Sue   21
## 3       Fred   18


#command to print dataframe data as a dictionary so I can copy the data from codecademy

#print(f'orders = pd.DataFrame( {str(orders.to_dict())} )')

orders = pd.DataFrame( {'id': {0: 54791, 1: 53450, 2: 91987, 3: 14437, 4: 79357, 5: 52386, 6: 20487, 7: 76971, 8: 21586, 9: 62083, 10: 91629, 11: 98602, 12: 45832, 13: 33862, 14: 73431, 15: 93889, 16: 39888, 17: 35961, 18: 24560, 19: 28559}, 'first_name': {0: 'Rebecca', 1: 'Emily', 2: 'Joyce', 3: 'Justin', 4: 'Andrew', 5: 'Julie', 6: 'Thomas', 7: 'Janice', 8: 'Gabriel', 9: 'Frances', 10: 'Jessica', 11: 'Lawrence', 12: 'Susan', 13: 'Diane', 14: 'Rebecca', 15: 'Jacqueline', 16: 'Vincent', 17: 'Roy', 18: 'Thomas', 19: 'Angela'}, 'last_name': {0: 'Lindsay', 1: 'Joyce', 2: 'Waller', 3: 'Erickson', 4: 'Banks', 5: 'Marsh', 6: 'Jensen', 7: 'Hicks', 8: 'Porter', 9: 'Palmer', 10: 'Hale', 11: 'Parker', 12: 'Dennis', 13: 'Ochoa', 14: 'Charles', 15: 'Crane', 16: 'Stephenson', 17: 'Tillman', 18: 'Roberson', 19: 'Newton'}, 'gender': {0: 'female', 1: 'female', 2: 'female', 3: 'male', 4: 'male', 5: 'female', 6: 'male', 7: 'female', 8: 'male', 9: 'female', 10: 'female', 11: 'male', 12: 'female', 13: 'female', 14: 'female', 15: 'female', 16: 'male', 17: 'male', 18: 'male', 19: 'female'}, 'email': {0: 'RebeccaLindsay57@hotmail.com', 1: 'EmilyJoyce25@gmail.com', 2: 'Joyce.Waller@gmail.com', 3: 'Justin.Erickson@outlook.com', 4: 'AB4318@gmail.com', 5: 'JulieMarsh59@gmail.com', 6: 'TJ5470@gmail.com', 7: 'Janice.Hicks@gmail.com', 8: 'GabrielPorter24@gmail.com', 9: 'FrancesPalmer50@gmail.com', 10: 'JessicaHale25@gmail.com', 11: 'LawrenceParker44@gmail.com', 12: 'SusanDennis58@gmail.com', 13: 'DO2680@gmail.com', 14: 'Rebecca.Charles@gmail.com', 15: 'JC2072@hotmail.com', 16: 'VS4753@outlook.com', 17: 'RoyTillman20@gmail.com', 18: 'Thomas.Roberson@gmail.com', 19: 'ANewton1977@outlook.com'}, 'shoe_type': {0: 'clogs', 1: 'ballet flats', 2: 'sandles', 3: 'clogs', 4: 'boots', 5: 'sandles', 6: 'clogs', 7: 'clogs', 8: 'clogs', 9: 'wedges', 10: 'clogs', 11: 'wedges', 12: 'ballet flats', 13: 'sandles', 14: 'boots', 15: 'wedges', 16: 'boots', 17: 'boots', 18: 'wedges', 19: 'wedges'}, 'shoe_material': {0: 'faux-leather', 1: 'faux-leather', 2: 'fabric', 3: 'faux-leather', 4: 'leather', 5: 'fabric', 6: 'fabric', 7: 'faux-leather', 8: 'leather', 9: 'leather', 10: 'leather', 11: 'fabric', 12: 'fabric', 13: 'fabric', 14: 'faux-leather', 15: 'fabric', 16: 'leather', 17: 'leather', 18: 'fabric', 19: 'fabric'}, 'shoe_color': {0: 'black', 1: 'navy', 2: 'black', 3: 'red', 4: 'brown', 5: 'black', 6: 'navy', 7: 'navy', 8: 'brown', 9: 'white', 10: 'red', 11: 'brown', 12: 'white', 13: 'red', 14: 'white', 15: 'red', 16: 'black', 17: 'white', 18: 'red', 19: 'red'}} )

orders.rename(columns={
    'name': 'First Name',
    'age': 'Age'},
    inplace=True)
    
shoe_source = lambda x: 'animal' if x.shoe_material == 'leather' else 'vegan'

orders['shoe_source'] = orders.apply(shoe_source, axis = 1)

salutation = lambda x: 'Dear Mr. ' + x.last_name if x.gender == 'male' else 'Dear Ms. ' + x.last_name

orders['salutation'] = orders.apply(salutation, axis = 1)

print(orders)

##        id  first_name   last_name  ... shoe_color shoe_source           salutation
## 0   54791     Rebecca     Lindsay  ...      black       vegan     Dear Ms. Lindsay
## 1   53450       Emily       Joyce  ...       navy       vegan       Dear Ms. Joyce
## 2   91987       Joyce      Waller  ...      black       vegan      Dear Ms. Waller
## 3   14437      Justin    Erickson  ...        red       vegan    Dear Mr. Erickson
## 4   79357      Andrew       Banks  ...      brown      animal       Dear Mr. Banks
## 5   52386       Julie       Marsh  ...      black       vegan       Dear Ms. Marsh
## 6   20487      Thomas      Jensen  ...       navy       vegan      Dear Mr. Jensen
## 7   76971      Janice       Hicks  ...       navy       vegan       Dear Ms. Hicks
## 8   21586     Gabriel      Porter  ...      brown      animal      Dear Mr. Porter
## 9   62083     Frances      Palmer  ...      white      animal      Dear Ms. Palmer
## 10  91629     Jessica        Hale  ...        red      animal        Dear Ms. Hale
## 11  98602    Lawrence      Parker  ...      brown       vegan      Dear Mr. Parker
## 12  45832       Susan      Dennis  ...      white       vegan      Dear Ms. Dennis
## 13  33862       Diane       Ochoa  ...        red       vegan       Dear Ms. Ochoa
## 14  73431     Rebecca     Charles  ...      white       vegan     Dear Ms. Charles
## 15  93889  Jacqueline       Crane  ...        red       vegan       Dear Ms. Crane
## 16  39888     Vincent  Stephenson  ...      black      animal  Dear Mr. Stephenson
## 17  35961         Roy     Tillman  ...      white      animal     Dear Mr. Tillman
## 18  24560      Thomas    Roberson  ...        red       vegan    Dear Mr. Roberson
## 19  28559      Angela      Newton  ...        red       vegan      Dear Ms. Newton
## 
## [20 rows x 10 columns]

Pedal Power Inventory

#import codecademylib3
#import pandas as pd

#inventory = pd.read_csv('inventory.csv')

inventory = pd.DataFrame( {'location': {0: 'Staten Island', 1: 'Staten Island', 2: 'Staten Island', 3: 'Staten Island', 4: 'Staten Island', 5: 'Staten Island', 6: 'Staten Island', 7: 'Staten Island', 8: 'Staten Island', 9: 'Staten Island', 10: 'Brooklyn', 11: 'Brooklyn', 12: 'Brooklyn', 13: 'Brooklyn', 14: 'Brooklyn', 15: 'Brooklyn', 16: 'Brooklyn', 17: 'Brooklyn', 18: 'Brooklyn', 19: 'Brooklyn', 20: 'Queens', 21: 'Queens', 22: 'Queens', 23: 'Queens', 24: 'Queens', 25: 'Queens', 26: 'Queens', 27: 'Queens', 28: 'Queens'}, 'product_type': {0: 'seeds', 1: 'seeds', 2: 'seeds', 3: 'garden tools', 4: 'garden tools', 5: 'garden tools', 6: 'pest_control', 7: 'pest_control', 8: 'planter', 9: 'planter', 10: 'seeds', 11: 'seeds', 12: 'seeds', 13: 'garden tools', 14: 'garden tools', 15: 'garden tools', 16: 'pest_control', 17: 'pest_control', 18: 'planter', 19: 'planter', 20: 'seeds', 21: 'seeds', 22: 'seeds', 23: 'garden tools', 24: 'garden tools', 25: 'garden tools', 26: 'pest_control', 27: 'pest_control', 28: 'planter'}, 'product_description': {0: 'daisy', 1: 'calla lily', 2: 'tomato', 3: 'rake', 4: 'wheelbarrow', 5: 'spade', 6: 'insect killer', 7: 'weed killer', 8: '20 inch terracotta planter', 9: '8 inch plastic planter', 10: 'daisy', 11: 'calla lily', 12: 'tomato', 13: 'rake', 14: 'wheelbarrow', 15: 'spade', 16: 'insect killer', 17: 'weed killer', 18: '20 inch terracotta planter', 19: '8 inch plastic planter', 20: 'daisy', 21: 'calla lily', 22: 'tomato', 23: 'rake', 24: 'wheelbarrow', 25: 'spade', 26: 'insect killer', 27: 'weed killer', 28: '20 inch terracotta planter'}, 'quantity': {0: 4, 1: 46, 2: 85, 3: 4, 4: 0, 5: 93, 6: 74, 7: 8, 8: 0, 9: 53, 10: 50, 11: 0, 12: 0, 13: 15, 14: 82, 15: 36, 16: 80, 17: 76, 18: 5, 19: 26, 20: 57, 21: 95, 22: 45, 23: 21, 24: 98, 25: 26, 26: 0, 27: 16, 28: 87}, 'price': {0: 6.99, 1: 19.99, 2: 13.99, 3: 13.99, 4: 89.99, 5: 19.99, 6: 12.99, 7: 23.99, 8: 17.99, 9: 3.99, 10: 6.99, 11: 19.99, 12: 13.99, 13: 13.99, 14: 89.99, 15: 19.99, 16: 12.99, 17: 23.99, 18: 17.99, 19: 3.99, 20: 6.99, 21: 19.99, 22: 13.99, 23: 13.99, 24: 89.99, 25: 19.99, 26: 12.99, 27: 23.99, 28: 17.99}} )

print(inventory.head())

##         location  product_type product_description  quantity  price
## 0  Staten Island         seeds               daisy         4   6.99
## 1  Staten Island         seeds          calla lily        46  19.99
## 2  Staten Island         seeds              tomato        85  13.99
## 3  Staten Island  garden tools                rake         4  13.99
## 4  Staten Island  garden tools         wheelbarrow         0  89.99

staten_island = inventory[:10]

product_request = staten_island.product_description 

seed_request = inventory.loc[(inventory['location'] == 'Brooklyn') & (inventory['product_type'] == 'seeds')]

inventory['in_stock'] = inventory.apply(lambda x: True if x.quantity > 0 else False, axis =1)

inventory['total_value'] = inventory.price * inventory.quantity

combine_lambda = lambda row: \
    '{} - {}'.format(row.product_type,
                     row.product_description)

inventory['full_description'] = inventory.apply(combine_lambda, axis = 1)

print(inventory)

##          location  ...                      full_description
## 0   Staten Island  ...                         seeds - daisy
## 1   Staten Island  ...                    seeds - calla lily
## 2   Staten Island  ...                        seeds - tomato
## 3   Staten Island  ...                   garden tools - rake
## 4   Staten Island  ...            garden tools - wheelbarrow
## 5   Staten Island  ...                  garden tools - spade
## 6   Staten Island  ...          pest_control - insect killer
## 7   Staten Island  ...            pest_control - weed killer
## 8   Staten Island  ...  planter - 20 inch terracotta planter
## 9   Staten Island  ...      planter - 8 inch plastic planter
## 10       Brooklyn  ...                         seeds - daisy
## 11       Brooklyn  ...                    seeds - calla lily
## 12       Brooklyn  ...                        seeds - tomato
## 13       Brooklyn  ...                   garden tools - rake
## 14       Brooklyn  ...            garden tools - wheelbarrow
## 15       Brooklyn  ...                  garden tools - spade
## 16       Brooklyn  ...          pest_control - insect killer
## 17       Brooklyn  ...            pest_control - weed killer
## 18       Brooklyn  ...  planter - 20 inch terracotta planter
## 19       Brooklyn  ...      planter - 8 inch plastic planter
## 20         Queens  ...                         seeds - daisy
## 21         Queens  ...                    seeds - calla lily
## 22         Queens  ...                        seeds - tomato
## 23         Queens  ...                   garden tools - rake
## 24         Queens  ...            garden tools - wheelbarrow
## 25         Queens  ...                  garden tools - spade
## 26         Queens  ...          pest_control - insect killer
## 27         Queens  ...            pest_control - weed killer
## 28         Queens  ...  planter - 20 inch terracotta planter
## 
## [29 rows x 8 columns]

Calculating Aggregate Functions

Calculate aggregate statistics (mean, standard deviation, median, percentiles, etc.) over certain subsets of the data.

df.groupby(‘column1’).column2.measurement()

orders = pd.DataFrame( {'id': {190: 57860, 191: 31063, 192: 18731, 193: 53962, 194: 50212, 195: 74787, 196: 15734, 197: 18956, 198: 55835, 199: 68900}, 'first_name': {190: 'Evelyn', 191: 'Debra', 192: 'Virginia', 193: 'Andrea', 194: 'Brenda', 195: 'James', 196: 'Dylan', 197: 'Tiffany', 198: 'Jeremy', 199: 'Eugene'}, 'last_name': {190: 'Holden', 191: 'Nichols', 192: 'Shepard', 193: 'Trevino', 194: 'Burnett', 195: 'Gill', 196: 'Hurst', 197: 'Stone', 198: 'Randall', 199: 'Kinney'}, 'email': {190: 'Evelyn.Holden@gmail.com', 191: 'DNichols1991@gmail.com', 192: 'VirginiaShepard73@gmail.com', 193: 'Andrea.Trevino@gmail.com', 194: 'BrendaBurnett83@hotmail.com', 195: 'JamesGill43@gmail.com', 196: 'Dylan.Hurst@gmail.com', 197: 'TStone1981@gmail.com', 198: 'JeremyRandall28@gmail.com', 199: 'EugeneKinney89@gmail.com'}, 'shoe_type': {190: 'ballet flats', 191: 'sandals', 192: 'sandals', 193: 'sandals', 194: 'sandals', 195: 'sandals', 196: 'wedges', 197: 'ballet flats', 198: 'sandals', 199: 'ballet flats'}, 'shoe_material': {190: 'fabric', 191: 'faux-leather', 192: 'fabric', 193: 'fabric', 194: 'fabric', 195: 'fabric', 196: 'leather', 197: 'leather', 198: 'fabric', 199: 'faux-leather'}, 'shoe_color': {190: 'brown', 191: 'navy', 192: 'white', 193: 'navy', 194: 'navy', 195: 'brown', 196: 'brown', 197: 'white', 198: 'brown', 199: 'black'}, 'price': {190: 477, 191: 478, 192: 480, 193: 482, 194: 484, 195: 485, 196: 488, 197: 492, 198: 498, 199: 498}} )

pricey_shoes = orders.groupby('shoe_type').price.max()

print(pricey_shoes)

## shoe_type
## ballet flats    498
## sandals         498
## wedges          488
## Name: price, dtype: int64

In order to get that, we can use reset_index(). This will transform our Series into a DataFrame and move the indices into their own column.

df.groupby(‘column1’).column2.measurement().reset_index()

pricey_shoes = orders.groupby('shoe_type').price.max().reset_index()

print(pricey_shoes)

##       shoe_type  price
## 0  ballet flats    498
## 1       sandals    498
## 2        wedges    488

print(type(pricey_shoes))

## <class 'pandas.core.frame.DataFrame'>

calculate other functions:

import numpy as np

cheap_shoes = orders.groupby('shoe_color').price \
    .apply(lambda x: np.percentile(x, 25)) \
    .reset_index()

print(cheap_shoes)

##   shoe_color  price
## 0      black  498.0
## 1      brown  483.0
## 2       navy  480.0
## 3      white  483.0

Group by more than one column

shoe_counts = orders.groupby(['shoe_type','shoe_color']).id.count().reset_index()

print(shoe_counts)

##       shoe_type shoe_color  id
## 0  ballet flats      black   1
## 1  ballet flats      brown   1
## 2  ballet flats      white   1
## 3       sandals      brown   2
## 4       sandals       navy   3
## 5       sandals      white   1
## 6        wedges      brown   1

Pivot Tables

df.pivot(columns=‘ColumnToPivot’, index=‘ColumnToBeRows’, values=‘ColumnToBeValues’)

shoe_counts_pivot = shoe_counts.pivot( 
  columns = 'shoe_color', 
  index = 'shoe_type' ,
  values = 'id').reset_index() 

print(shoe_counts_pivot)

## shoe_color     shoe_type  black  brown  navy  white
## 0           ballet flats    1.0    1.0   NaN    1.0
## 1                sandals    NaN    2.0   3.0    1.0
## 2                 wedges    NaN    1.0   NaN    NaN

user_visits = pd.DataFrame( {'id': {0: 10043, 1: 10150, 2: 10155, 3: 10178, 4: 10208, 5: 10260, 6: 10271, 7: 10278, 8: 10320, 9: 10389, 10: 10432, 11: 10511, 12: 10572, 13: 10672, 14: 10683, 15: 10717, 16: 10763, 17: 10788, 18: 10809, 19: 10845}, 'first_name': {0: 'Louis', 1: 'Bruce', 2: 'Nicholas', 3: 'William', 4: 'Karen', 5: 'Benjamin', 6: 'Gerald', 7: 'Melissa', 8: 'Adam', 9: 'Ethan', 10: 'Charles', 11: 'Scott', 12: 'Samantha', 13: 'Joyce', 14: 'Richard', 15: 'Louis', 16: 'Jesse', 17: 'Donald', 18: 'Olivia', 19: 'Daniel'}, 'last_name': {0: 'Koch', 1: 'Webb', 2: 'Hoffman', 3: 'Key', 4: 'Bass', 5: 'Ochoa', 6: 'Aguilar', 7: 'Lambert', 8: 'Strickland', 9: 'Payne', 10: 'Herrera', 11: 'Hines', 12: 'Townsend', 13: 'Ford', 14: 'Stanley', 15: 'Prince', 16: 'Booker', 17: 'Mcpherson', 18: 'Osborne', 19: 'Haney'}, 'email': {0: 'LouisKoch43@gmail.com', 1: 'BruceWebb44@outlook.com', 2: 'Nicholas.Hoffman@gmail.com', 3: 'William.Key@outlook.com', 4: 'KB4971@gmail.com', 5: 'Benjamin.Ochoa@outlook.com', 6: 'Gerald.Aguilar@gmail.com', 7: 'Melissa.Lambert@gmail.com', 8: 'Adam.Strickland@gmail.com', 9: 'EthanPayne26@outlook.com', 10: 'CH7703@gmail.com', 11: 'SHines1999@hotmail.com', 12: 'Samantha.Townsend@gmail.com', 13: 'JF8697@gmail.com', 14: 'RStanley1971@gmail.com', 15: 'LPrince1992@gmail.com', 16: 'JBooker1977@gmail.com', 17: 'DMcpherson1995@gmail.com', 18: 'Olivia.Osborne@gmail.com', 19: 'DHaney1985@gmail.com'}, 'month': {0: '3 - March', 1: '3 - March', 2: '2 - February', 3: '3 - March', 4: '2 - February', 5: '1 - January', 6: '3 - March', 7: '2 - February', 8: '3 - March', 9: '2 - February', 10: '3 - March', 11: '3 - March', 12: '2 - February', 13: '3 - March', 14: '3 - March', 15: '3 - March', 16: '1 - January', 17: '2 - February', 18: '1 - January', 19: '3 - March'}, 'utm_source': {0: 'yahoo', 1: 'twitter', 2: 'google', 3: 'yahoo', 4: 'google', 5: 'twitter', 6: 'google', 7: 'email', 8: 'email', 9: 'facebook', 10: 'yahoo', 11: 'yahoo', 12: 'google', 13: 'email', 14: 'email', 15: 'yahoo', 16: 'yahoo', 17: 'yahoo', 18: 'facebook', 19: 'yahoo'}} )

print(user_visits.head(5))

##       id first_name  ...         month utm_source
## 0  10043      Louis  ...     3 - March      yahoo
## 1  10150      Bruce  ...     3 - March    twitter
## 2  10155   Nicholas  ...  2 - February     google
## 3  10178    William  ...     3 - March      yahoo
## 4  10208      Karen  ...  2 - February     google
## 
## [5 rows x 6 columns]

click_source = user_visits.groupby('utm_source').count().reset_index()

print(click_source)

##   utm_source  id  first_name  last_name  email  month
## 0      email   4           4          4      4      4
## 1   facebook   2           2          2      2      2
## 2     google   4           4          4      4      4
## 3    twitter   2           2          2      2      2
## 4      yahoo   8           8          8      8      8

click_source_by_month = user_visits.groupby(['utm_source','month']).count().reset_index()

print(click_source_by_month)

##    utm_source         month  id  first_name  last_name  email
## 0       email  2 - February   1           1          1      1
## 1       email     3 - March   3           3          3      3
## 2    facebook   1 - January   1           1          1      1
## 3    facebook  2 - February   1           1          1      1
## 4      google  2 - February   3           3          3      3
## 5      google     3 - March   1           1          1      1
## 6     twitter   1 - January   1           1          1      1
## 7     twitter     3 - March   1           1          1      1
## 8       yahoo   1 - January   1           1          1      1
## 9       yahoo  2 - February   1           1          1      1
## 10      yahoo     3 - March   6           6          6      6

click_source_by_month_pivot = click_source_by_month.pivot(
  columns = 'month',
  index = 'utm_source',
  values = 'id').reset_index()

print(click_source_by_month_pivot)

## month utm_source  1 - January  2 - February  3 - March
## 0          email          NaN           1.0        3.0
## 1       facebook          1.0           1.0        NaN
## 2         google          NaN           3.0        1.0
## 3        twitter          1.0           NaN        1.0
## 4          yahoo          1.0           1.0        6.0

A/B testing in Python


# import codecademylib3
import pandas as pd
import numpy as np

# ad_clicks = pd.read_csv('ad_clicks.csv')

# ad_clicks = ad_clicks.replace({np.nan: None})
# print(f'ad_clicks = pd.DataFrame( {str(ad_clicks.head(100).to_dict())} )')

Use first 100 rows of data rather than loading codecademylib3


ad_clicks = pd.DataFrame( {'user_id': {0: '008b7c6c-7272-471e-b90e-930d548bd8d7', 1: '009abb94-5e14-4b6c-bb1c-4f4df7aa7557', 2: '00f5d532-ed58-4570-b6d2-768df5f41aed', 3: '011adc64-0f44-4fd9-a0bb-f1506d2ad439', 4: '012137e6-7ae7-4649-af68-205b4702169c', 5: '013b0072-7b72-40e7-b698-98b4d0c9967f', 6: '0153d85b-7660-4c39-92eb-1e1acd023280', 7: '01555297-d6e6-49ae-aeba-1b196fdbb09f', 8: '018cea61-19ea-4119-895b-1a4309ccb148', 9: '01a210c3-fde0-4e6f-8efd-4f0e38730ae6', 10: '01adb2e7-f711-4ae4-a7c6-29f48457eea1', 11: '01ae0361-7d17-4760-a2c8-23977a46fb78', 12: '01fb228a-9d28-4cde-932c-59b933fa763b', 13: '023598b8-09e2-40ed-9c90-34d607094ff9', 14: '02405d93-9c33-4034-894a-b9523956a3ad', 15: '0254b59f-082d-4a5a-913d-4f2bba267768', 16: '02d8dba0-5d12-4983-a407-63fab9757d94', 17: '0378e9e1-0ad8-4a26-8190-ebb3370239d3', 18: '041deef8-b242-4114-afd0-e584784ec9f0', 19: '0429608e-61f3-4df0-ba45-3633029a14db', 20: '04389894-b52c-4485-9266-435e9b9c0efd', 21: '0441f866-dd3f-422a-a36a-cdc2f034e6ef', 22: '04797f05-9cbe-48b6-8662-b9ee20828b0a', 23: '0482e1ab-b513-4bfb-9914-9fb5ab41c915', 24: '04ca737b-85f4-4194-8c38-d8d363b490f7', 25: '056f771f-e5ba-483f-a074-db2d7b94309d', 26: '05aec9ea-92c1-4ffc-84b5-c3864dd307e4', 27: '05c41c49-8521-4ad0-a1b5-724e99fe46ff', 28: '05d08d6d-62d5-4e28-8f10-107ae2cd03c7', 29: '05e8721a-cbb7-4c13-a115-eac7dd5ee7f6', 30: '06234d6f-8729-49d3-a39e-6bba740847e3', 31: '068ad376-b5c2-4825-afc0-2be657e1f4df', 32: '06a18971-a97b-48a4-bda6-3881960e3796', 33: '06a8681a-7de2-4053-8462-29df2b1e8aab', 34: '06aa1cc0-4366-4558-b74c-24bb797cff0b', 35: '06cf79f4-ac53-4b72-8736-829b8c63c7df', 36: '06da5468-c366-4388-b0ac-f84ea36949e0', 37: '07476cb4-247d-4fe2-b830-a399717bd07e', 38: '076b9eb7-a7c3-4319-a830-eb3344d88b6e', 39: '0771c92b-2ec8-463d-99fc-d4c31b7264b1', 40: '07760e06-0ff5-4f4b-a483-eaa46e504aca', 41: '0789f648-70fa-4711-8d1c-62ddde084aff', 42: '07cc1e2c-b5cc-4daf-95f2-2326021e7d52', 43: '081372d5-a1a8-4819-9bf1-911d4855360c', 44: '082ad0da-546b-4c00-b2f8-e21a5113e66a', 45: '083e3bbf-abe4-418c-8b17-7ac77b436ab1', 46: '0846d988-4b9e-4593-857b-594dabb8d5bb', 47: '084e8dc0-94df-4321-bbde-dc521b5c58d9', 48: '089cd63a-f7e2-4381-a864-b4378334885f', 49: '0928d4d2-9012-4806-99f7-06672ede99af', 50: '09347a02-3824-481b-a9aa-706047b3fcb0', 51: '097b056d-bad3-4fd0-a304-2c1a74bfc64e', 52: '099e073f-e629-4ba2-9979-47a8f117b771', 53: '0a046b08-1ed1-4ff6-b64c-176d1472dc9b', 54: '0a33a6d3-85ba-4120-88a6-63ce94df559d', 55: '0a8bcbfc-d71b-4407-8df3-d4d9a85b6fd5', 56: '0aa54825-7375-4b19-8622-d5e8ce41b73e', 57: '0b14202a-919f-4d9c-915c-82d0d4c13500', 58: '0b4a71b1-b264-4bc3-8ca3-d8c893e009ef', 59: '0bac51ca-724a-4e21-bd2e-2ccb9e426d76', 60: '0bece189-a24f-4dcf-888e-583381a26383', 61: '0bf24554-eac9-423d-8dc0-349e9629f422', 62: '0c1e88c0-dde5-400a-83fb-3b60f86ef62c', 63: '0c8fbfbf-38e6-4639-8aff-26f3dbd4492b', 64: '0cb5db50-1ded-4f68-8574-30b94677e4b8', 65: '0d16721b-99cc-4fda-8285-dc0675b93c26', 66: '0d4e8d25-6432-494d-9d25-616d165bdfa7', 67: '0d90321b-e549-4ceb-bc37-e8248a462863', 68: '0d92256f-4380-49c2-91d8-dab758556fe1', 69: '0db7af3c-1601-4cc5-b68d-7715927c6ce4', 70: '0dbb0270-1caf-420a-829f-ff1770ac865b', 71: '0dd20a2e-cb05-46c7-b14a-2e4a54d3dca3', 72: '0dd3a053-0fd2-4761-b588-ecfb2dc85983', 73: '0debfe68-f334-448e-8408-b2147eabac36', 74: '0e02cc50-6226-459c-8cf4-f727a2ab23ad', 75: '0e07be6f-4a36-4f75-98e7-460c4d2b740a', 76: '0e0b6077-26cb-4d81-908c-61d149c3af4a', 77: '0e79d58c-cf48-4635-ad09-324373a02b72', 78: '0e83486b-16ce-4565-9d08-16ebdb8e02e6', 79: '0e857013-279a-46cd-80e3-95e87b4ceb45', 80: '0ef8f17a-3710-4eca-a81f-9184202f6921', 81: '0f410c84-05f3-450c-8fdf-e514cece69e5', 82: '0f618ad1-8b42-4b89-9bac-e42e6c56d4bf', 83: '0f7c85ed-9ddc-4b4f-abdd-40aa5deda4a3', 84: '0f849a72-a35c-466a-845e-1fe31894e2ab', 85: '0faba4df-7bea-470a-8100-27b8aa7d7190', 86: '0fba2e02-2be7-4481-b3f0-cdf9d4529e1c', 87: '0fc9978b-273f-4ba0-8e27-5d60efb9af3b', 88: '0fe1e1f1-f3c7-4bd8-8f1d-33bf3c216d7f', 89: '0ffe21ef-31b3-43f4-97fe-22cb7e77f6a0', 90: '100ca263-d877-4fc2-91ba-c860ab59b4ad', 91: '1013d42e-cc4a-4bc1-9994-69ec076f9ae7', 92: '104dfffd-4431-44fd-96a9-579b47709989', 93: '109bf72e-9672-4588-b4d6-a042e6c42cc2', 94: '10d1b412-39dd-488e-b51c-da566e9edc17', 95: '113f979a-b4aa-47fb-ab14-b2ebb211ed40', 96: '1141b1f9-4e20-4b4f-a0c2-8f7e6bc2b6bd', 97: '114e067e-2f09-452c-a293-859cb8303326', 98: '114f097c-92fd-49ce-81bf-e1bdc79d2331', 99: '11b9794f-1c03-42f3-835b-a186246972e5'}, 'utm_source': {0: 'google', 1: 'facebook', 2: 'twitter', 3: 'google', 4: 'facebook', 5: 'facebook', 6: 'google', 7: 'google', 8: 'email', 9: 'email', 10: 'google', 11: 'facebook', 12: 'email', 13: 'google', 14: 'twitter', 15: 'google', 16: 'google', 17: 'facebook', 18: 'google', 19: 'google', 20: 'email', 21: 'google', 22: 'google', 23: 'twitter', 24: 'google', 25: 'google', 26: 'facebook', 27: 'facebook', 28: 'facebook', 29: 'email', 30: 'facebook', 31: 'email', 32: 'google', 33: 'facebook', 34: 'email', 35: 'google', 36: 'google', 37: 'twitter', 38: 'google', 39: 'google', 40: 'email', 41: 'google', 42: 'google', 43: 'twitter', 44: 'facebook', 45: 'google', 46: 'google', 47: 'facebook', 48: 'facebook', 49: 'google', 50: 'facebook', 51: 'google', 52: 'facebook', 53: 'facebook', 54: 'facebook', 55: 'facebook', 56: 'twitter', 57: 'google', 58: 'email', 59: 'email', 60: 'google', 61: 'facebook', 62: 'google', 63: 'google', 64: 'facebook', 65: 'google', 66: 'google', 67: 'google', 68: 'facebook', 69: 'facebook', 70: 'twitter', 71: 'twitter', 72: 'email', 73: 'google', 74: 'google', 75: 'facebook', 76: 'facebook', 77: 'email', 78: 'google', 79: 'google', 80: 'twitter', 81: 'facebook', 82: 'facebook', 83: 'facebook', 84: 'email', 85: 'google', 86: 'twitter', 87: 'facebook', 88: 'facebook', 89: 'facebook', 90: 'google', 91: 'google', 92: 'google', 93: 'google', 94: 'facebook', 95: 'facebook', 96: 'twitter', 97: 'email', 98: 'email', 99: 'google'}, 'day': {0: '6 - Saturday', 1: '7 - Sunday', 2: '2 - Tuesday', 3: '2 - Tuesday', 4: '7 - Sunday', 5: '1 - Monday', 6: '4 - Thursday', 7: '3 - Wednesday', 8: '1 - Monday', 9: '2 - Tuesday', 10: '3 - Wednesday', 11: '4 - Thursday', 12: '7 - Sunday', 13: '2 - Tuesday', 14: '2 - Tuesday', 15: '5 - Friday', 16: '3 - Wednesday', 17: '1 - Monday', 18: '3 - Wednesday', 19: '7 - Sunday', 20: '1 - Monday', 21: '7 - Sunday', 22: '6 - Saturday', 23: '6 - Saturday', 24: '2 - Tuesday', 25: '4 - Thursday', 26: '5 - Friday', 27: '7 - Sunday', 28: '2 - Tuesday', 29: '5 - Friday', 30: '6 - Saturday', 31: '3 - Wednesday', 32: '4 - Thursday', 33: '7 - Sunday', 34: '4 - Thursday', 35: '6 - Saturday', 36: '5 - Friday', 37: '7 - Sunday', 38: '5 - Friday', 39: '7 - Sunday', 40: '3 - Wednesday', 41: '1 - Monday', 42: '4 - Thursday', 43: '7 - Sunday', 44: '2 - Tuesday', 45: '2 - Tuesday', 46: '4 - Thursday', 47: '2 - Tuesday', 48: '7 - Sunday', 49: '3 - Wednesday', 50: '4 - Thursday', 51: '1 - Monday', 52: '2 - Tuesday', 53: '1 - Monday', 54: '4 - Thursday', 55: '3 - Wednesday', 56: '4 - Thursday', 57: '1 - Monday', 58: '5 - Friday', 59: '7 - Sunday', 60: '4 - Thursday', 61: '1 - Monday', 62: '1 - Monday', 63: '4 - Thursday', 64: '2 - Tuesday', 65: '4 - Thursday', 66: '7 - Sunday', 67: '6 - Saturday', 68: '1 - Monday', 69: '4 - Thursday', 70: '3 - Wednesday', 71: '3 - Wednesday', 72: '6 - Saturday', 73: '6 - Saturday', 74: '3 - Wednesday', 75: '3 - Wednesday', 76: '3 - Wednesday', 77: '2 - Tuesday', 78: '2 - Tuesday', 79: '7 - Sunday', 80: '1 - Monday', 81: '6 - Saturday', 82: '1 - Monday', 83: '3 - Wednesday', 84: '3 - Wednesday', 85: '2 - Tuesday', 86: '1 - Monday', 87: '1 - Monday', 88: '2 - Tuesday', 89: '5 - Friday', 90: '5 - Friday', 91: '6 - Saturday', 92: '7 - Sunday', 93: '3 - Wednesday', 94: '4 - Thursday', 95: '5 - Friday', 96: '2 - Tuesday', 97: '1 - Monday', 98: '5 - Friday', 99: '5 - Friday'}, 'ad_click_timestamp': {0: '7:18', 1: None, 2: None, 3: None, 4: None, 5: None, 6: None, 7: None, 8: '18:33', 9: '15:21', 10: None, 11: '7:11', 12: None, 13: None, 14: None, 15: None, 16: None, 17: None, 18: '10:54', 19: None, 20: None, 21: '11:16', 22: None, 23: '1:20', 24: None, 25: None, 26: '8:46', 27: None, 28: None, 29: None, 30: None, 31: None, 32: None, 33: '9:2', 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: '3:21', 46: None, 47: None, 48: None, 49: None, 50: '16:52', 51: None, 52: None, 53: None, 54: None, 55: None, 56: '10:21', 57: None, 58: None, 59: None, 60: None, 61: '2:2', 62: None, 63: '12:23', 64: '4:8', 65: '5:27', 66: None, 67: None, 68: None, 69: None, 70: '13:46', 71: '10:24', 72: None, 73: None, 74: None, 75: None, 76: None, 77: None, 78: None, 79: '1:49', 80: None, 81: None, 82: None, 83: None, 84: None, 85: None, 86: None, 87: None, 88: None, 89: None, 90: None, 91: '5:1', 92: '10:14', 93: '21:18', 94: '17:44', 95: None, 96: None, 97: None, 98: None, 99: '19:30'}, 'experimental_group': {0: 'A', 1: 'B', 2: 'A', 3: 'B', 4: 'B', 5: 'A', 6: 'A', 7: 'A', 8: 'A', 9: 'B', 10: 'B', 11: 'B', 12: 'A', 13: 'B', 14: 'A', 15: 'A', 16: 'B', 17: 'B', 18: 'A', 19: 'B', 20: 'B', 21: 'A', 22: 'B', 23: 'A', 24: 'B', 25: 'A', 26: 'B', 27: 'B', 28: 'A', 29: 'B', 30: 'A', 31: 'B', 32: 'B', 33: 'A', 34: 'A', 35: 'B', 36: 'B', 37: 'B', 38: 'A', 39: 'B', 40: 'B', 41: 'A', 42: 'B', 43: 'B', 44: 'A', 45: 'B', 46: 'B', 47: 'A', 48: 'B', 49: 'B', 50: 'A', 51: 'B', 52: 'B', 53: 'A', 54: 'B', 55: 'B', 56: 'A', 57: 'B', 58: 'A', 59: 'B', 60: 'A', 61: 'A', 62: 'B', 63: 'A', 64: 'A', 65: 'A', 66: 'B', 67: 'A', 68: 'B', 69: 'A', 70: 'A', 71: 'B', 72: 'B', 73: 'A', 74: 'A', 75: 'B', 76: 'A', 77: 'B', 78: 'A', 79: 'B', 80: 'A', 81: 'A', 82: 'B', 83: 'B', 84: 'A', 85: 'A', 86: 'B', 87: 'B', 88: 'A', 89: 'A', 90: 'B', 91: 'A', 92: 'A', 93: 'A', 94: 'A', 95: 'A', 96: 'B', 97: 'A', 98: 'A', 99: 'A'}} )

print(ad_clicks.head())

##                                 user_id  ... experimental_group
## 0  008b7c6c-7272-471e-b90e-930d548bd8d7  ...                  A
## 1  009abb94-5e14-4b6c-bb1c-4f4df7aa7557  ...                  B
## 2  00f5d532-ed58-4570-b6d2-768df5f41aed  ...                  A
## 3  011adc64-0f44-4fd9-a0bb-f1506d2ad439  ...                  B
## 4  012137e6-7ae7-4649-af68-205b4702169c  ...                  B
## 
## [5 rows x 5 columns]

print(ad_clicks.groupby('utm_source').count())

##             user_id  day  ad_click_timestamp  experimental_group
## utm_source                                                      
## email            15   15                   2                  15
## facebook         32   32                   7                  32
## google           42   42                  11                  42
## twitter          11   11                   4                  11

ad_clicks['is_clicked'] = ad_clicks.ad_click_timestamp.notnull() 

print(ad_clicks.head())

##                                 user_id  ... is_clicked
## 0  008b7c6c-7272-471e-b90e-930d548bd8d7  ...       True
## 1  009abb94-5e14-4b6c-bb1c-4f4df7aa7557  ...      False
## 2  00f5d532-ed58-4570-b6d2-768df5f41aed  ...      False
## 3  011adc64-0f44-4fd9-a0bb-f1506d2ad439  ...      False
## 4  012137e6-7ae7-4649-af68-205b4702169c  ...      False
## 
## [5 rows x 6 columns]


print(ad_clicks.head())

##                                 user_id  ... is_clicked
## 0  008b7c6c-7272-471e-b90e-930d548bd8d7  ...       True
## 1  009abb94-5e14-4b6c-bb1c-4f4df7aa7557  ...      False
## 2  00f5d532-ed58-4570-b6d2-768df5f41aed  ...      False
## 3  011adc64-0f44-4fd9-a0bb-f1506d2ad439  ...      False
## 4  012137e6-7ae7-4649-af68-205b4702169c  ...      False
## 
## [5 rows x 6 columns]

clicks_by_source = ad_clicks.groupby(['utm_source', 'is_clicked']).user_id.count().reset_index()

print(clicks_by_source)

##   utm_source  is_clicked  user_id
## 0      email       False       13
## 1      email        True        2
## 2   facebook       False       25
## 3   facebook        True        7
## 4     google       False       31
## 5     google        True       11
## 6    twitter       False        7
## 7    twitter        True        4

clicks_pivot = clicks_by_source.pivot(
  columns = 'is_clicked',
  index = 'utm_source',
  values = 'user_id').reset_index()

print(clicks_pivot)

## is_clicked utm_source  False  True
## 0               email     13     2
## 1            facebook     25     7
## 2              google     31    11
## 3             twitter      7     4

note: no quotation marks around True and False


clicks_pivot['percent_clicked'] = clicks_pivot[True] / (clicks_pivot[True] + clicks_pivot[False])

print(clicks_pivot)

## is_clicked utm_source  False  True  percent_clicked
## 0               email     13     2         0.133333
## 1            facebook     25     7         0.218750
## 2              google     31    11         0.261905
## 3             twitter      7     4         0.363636

clicks_by_ad = ad_clicks.groupby(['experimental_group', 'is_clicked']).user_id.count().reset_index()

print(clicks_by_ad)

##   experimental_group  is_clicked  user_id
## 0                  A       False       33
## 1                  A        True       18
## 2                  B       False       43
## 3                  B        True        6

clicks_pivot_ad = clicks_by_ad.pivot(
  columns = 'is_clicked',
  index = 'experimental_group',
  values = 'user_id').reset_index()


clicks_pivot_ad['percent_clicked'] = clicks_pivot_ad[True] / (clicks_pivot_ad[True] + clicks_pivot_ad[False])

print(clicks_pivot_ad)

## is_clicked experimental_group  False  True  percent_clicked
## 0                           A     33    18         0.352941
## 1                           B     43     6         0.122449

a_clicks = ad_clicks[ad_clicks.experimental_group == 'A']
b_clicks = ad_clicks[ad_clicks.experimental_group == 'B']

a_clicks_by_day = a_clicks.groupby(['day', 'is_clicked']).user_id.count().reset_index()

a_clicks_pivot_by_day = a_clicks_by_day.pivot(
  columns = 'is_clicked',
  index = 'day',
  values = 'user_id').reset_index()


a_clicks_pivot_by_day['percent_clicked'] = a_clicks_pivot_by_day[True] / (a_clicks_pivot_by_day[True] + a_clicks_pivot_by_day[False])

print(a_clicks_pivot_by_day)

## is_clicked            day  False  True  percent_clicked
## 0              1 - Monday      5     2         0.285714
## 1             2 - Tuesday      8     1         0.111111
## 2           3 - Wednesday      4     3         0.428571
## 3            4 - Thursday      5     5         0.500000
## 4              5 - Friday      6     1         0.142857
## 5            6 - Saturday      4     3         0.428571
## 6              7 - Sunday      1     3         0.750000

b_clicks_by_day = b_clicks.groupby(['day', 'is_clicked']).user_id.count().reset_index()

b_clicks_pivot_by_day = b_clicks_by_day.pivot(
  columns = 'is_clicked',
  index = 'day',
  values = 'user_id').reset_index()


b_clicks_pivot_by_day['percent_clicked'] = b_clicks_pivot_by_day[True] / (b_clicks_pivot_by_day[True] + b_clicks_pivot_by_day[False])

print(b_clicks_pivot_by_day)

## is_clicked            day  False  True  percent_clicked
## 0              1 - Monday    9.0   NaN              NaN
## 1             2 - Tuesday    6.0   2.0         0.250000
## 2           3 - Wednesday    8.0   1.0         0.111111
## 3            4 - Thursday    4.0   1.0         0.200000
## 4              5 - Friday    3.0   1.0         0.250000
## 5            6 - Saturday    3.0   NaN              NaN
## 6              7 - Sunday   10.0   1.0         0.090909

Python

Calculating Aggregate Functions

Pivot Tables

A/B testing in Python

See also