from datascience import *
import numpy as np
pups = Table.read_table('data/pups.csv')
pups
| name | age | size | 
|---|---|---|
| Junior Smith | 11 | medium | 
| Rex Rogers | 7 | big | 
| Flash Heat | 3 | big | 
| Reese Bo | 4 | medium | 
| Polo Cash | 2 | small | 
pups.with_columns(
    'human years', pups.column('age') * 7
)
| name | age | size | human years | 
|---|---|---|---|
| Junior Smith | 11 | medium | 77 | 
| Rex Rogers | 7 | big | 49 | 
| Flash Heat | 3 | big | 21 | 
| Reese Bo | 4 | medium | 28 | 
| Polo Cash | 2 | small | 14 | 
def seven_times(x):
    return 7 * x
pups.apply(seven_times, 'age')
array([77, 49, 21, 28, 14])
Note, we wouldn't actually use the above example since we could just write pups.column('age') * 7.
Here's a more useful example:
def email_from_name(name):
    first, last = name.split(' ')
    email = first + '.' + last + '@dogschool.edu'
    return email.lower()
# Can use email_from_name on a single argument
email_from_name('Champ Major')
'champ.major@dogschool.edu'
pups.apply(email_from_name, 'name')
array(['junior.smith@dogschool.edu', 'rex.rogers@dogschool.edu',
       'flash.heat@dogschool.edu', 'reese.bo@dogschool.edu',
       'polo.cash@dogschool.edu'], dtype='<U26')
pups.with_columns('email', pups.apply(email_from_name, 'name'))
| name | age | size | |
|---|---|---|---|
| Junior Smith | 11 | medium | junior.smith@dogschool.edu | 
| Rex Rogers | 7 | big | rex.rogers@dogschool.edu | 
| Flash Heat | 3 | big | flash.heat@dogschool.edu | 
| Reese Bo | 4 | medium | reese.bo@dogschool.edu | 
| Polo Cash | 2 | small | polo.cash@dogschool.edu | 
# Note, the parameter names don't
# need to be 'age' and 'size'
def human_years_converter(years_old, kind):
    if kind == 'small':
        return years_old * 6
    elif kind == 'medium':
        return years_old * 7
    else:
        return years_old * 8
human_years_converter(11, 'medium')
77
human_years_converter(11, 'small')
66
pups.apply(human_years_converter, 'age', 'size')
array([77, 56, 24, 28, 12])
pups.with_columns('accurate human years', pups.apply(human_years_converter, 'age', 'size'))
| name | age | size | accurate human years | 
|---|---|---|---|
| Junior Smith | 11 | medium | 77 | 
| Rex Rogers | 7 | big | 56 | 
| Flash Heat | 3 | big | 24 | 
| Reese Bo | 4 | medium | 28 | 
| Polo Cash | 2 | small | 12 | 
# Large file – this may take ~10 seconds to load
salary = Table.read_table('https://media.githubusercontent.com/media/dailycal-projects/ucb-faculty-salary/master/data/salary/salary_2015.csv')
salary
| year | location | first | last | title | gross | regular | overtime | other | 
|---|---|---|---|---|---|---|---|---|
| 2015 | Berkeley | ANNE | AABOE | BUS SYS ANL 4 | 124,454 | 124,454 | 0 | 0 | 
| 2015 | Berkeley | DAVID | AAKER | RECALL FACULTY | 2,500 | 0 | 0 | 2,500 | 
| 2015 | Berkeley | ELIZABETH | ABEL | PROF-AY | 138,775 | 138,775 | 0 | 0 | 
| 2015 | Berkeley | NORMAN | ABRAHAMSON | ADJ PROF-AY-1/9-B/E/E | 19,668 | 19,668 | 0 | 0 | 
| 2015 | Berkeley | BARBARA | ABRAMS | PROF-AY | 191,162 | 169,862 | 0 | 21,300 | 
| 2015 | Berkeley | JOHN | ACZON | FINANCIAL SVC ANL 3 | 83,510 | 78,510 | 0 | 5,000 | 
| 2015 | Berkeley | ANTHONY | ADAMS | RES-FY | 9,587 | 9,587 | 0 | 0 | 
| 2015 | Berkeley | PENNY | HINES | ACAD HR ANL 5 | 126,707 | 126,707 | 0 | 0 | 
| 2015 | Berkeley | ANINDITA | ADHIKARI | SR LECT SOE-AY | 107,345 | 107,345 | 0 | 0 | 
| 2015 | Berkeley | ILAN | ADLER | PROF-AY-B/E/E | 166,617 | 151,617 | 0 | 15,000 | 
... (281504 rows omitted)
profs = salary.select('first', 'last', 'title', 'gross').where('title', are.containing('PROF'))
profs
| first | last | title | gross | 
|---|---|---|---|
| ELIZABETH | ABEL | PROF-AY | 138,775 | 
| NORMAN | ABRAHAMSON | ADJ PROF-AY-1/9-B/E/E | 19,668 | 
| BARBARA | ABRAMS | PROF-AY | 191,162 | 
| ILAN | ADLER | PROF-AY-B/E/E | 166,617 | 
| VINOD | AGGARWAL | PROF-AY | 167,525 | 
| ALICE | AGOGINO | PROF-AY-B/E/E | 243,259 | 
| DAVID | ALDOUS | PROF-AY | 218,666 | 
| RONELLE | ALEXANDER | PROF-AY | 167,642 | 
| NEZAR | ALSAYYAD | PROF-AY | 210,389 | 
| GENEVIEVE | AMES | ADJ PROF-AY | 9,783 | 
... (17564 rows omitted)
Look at the very last row of the output – that gross income doesn't look right.
profs.sort('gross', descending = True)
| first | last | title | gross | 
|---|---|---|---|
| STEVEN H | APPLEBAUM | HS ASSOC CLIN PROF-HCOMP | 999,756 | 
| JOHN A | GLASPY | PROF-HCOMP | 999,631 | 
| FRANK P.K. | HSU | PROF OF CLIN-HCOMP | 998,340 | 
| JOHN STUART | NELSON | PROF-HCOMP | 997,975 | 
| HANMIN | LEE | PROF OF CLIN-HCOMP | 995,434 | 
| DENNIS J | SLAMON | PROF-HCOMP | 991,973 | 
| BENJAMIN J | ANSELL | HS CLIN PROF-HCOMP | 991,543 | 
| NICHOLAS C | SAENZ | HS CLIN PROF-HCOMP | 991,463 | 
| JOSEPH F | GRECO | HS ASST CLIN PROF-HCOMP | 991,458 | 
| OMRI Y. | MARIAN | ACT PROF-AY-LAW | 99,997 | 
... (17564 rows omitted)
It's because the entries in the 'gross' column are strings, not integers.
profs.column('gross').item(0)
'138,775'
Your job is to fix that!
def fix_income(income):
    return int(income.replace(',', ''))
fixed_income = profs.apply(fix_income, 'gross')
profs = profs.with_columns(
    'gross', fixed_income
)
profs
| first | last | title | gross | 
|---|---|---|---|
| ELIZABETH | ABEL | PROF-AY | 138775 | 
| NORMAN | ABRAHAMSON | ADJ PROF-AY-1/9-B/E/E | 19668 | 
| BARBARA | ABRAMS | PROF-AY | 191162 | 
| ILAN | ADLER | PROF-AY-B/E/E | 166617 | 
| VINOD | AGGARWAL | PROF-AY | 167525 | 
| ALICE | AGOGINO | PROF-AY-B/E/E | 243259 | 
| DAVID | ALDOUS | PROF-AY | 218666 | 
| RONELLE | ALEXANDER | PROF-AY | 167642 | 
| NEZAR | ALSAYYAD | PROF-AY | 210389 | 
| GENEVIEVE | AMES | ADJ PROF-AY | 9783 | 
... (17564 rows omitted)
type(profs.column('gross')[0])
numpy.int64
grade_bins = {
    'A+': 97,
    'A': 92,
    'B+': 85,
    'B': 79,
    'C+': 74,
    'C': 68,
    'D+': 58,
    'D': 50,
    'F': 0
}
def pct_to_letter(pct):
    for letter in grade_bins.keys():
        if pct >= grade_bins[letter]:
            return letter
pct_to_letter(59)
'D+'
pct_to_letter(98)
'A+'
gradebook = Table().with_columns(
    'Name', np.array(['Carrera', 'Panamera', 'Taycan', 'Cayenne', 'Macan', 'Cayman', 'Boxster']),
    'Grading Option', np.array(['GRD', 'PNP', 'PNP', 'GRD', 'GRD', 'GRD', 'PNP']),
    'Score', np.array([98, 86, 67.5, 45, 82, 88, 71])
)
gradebook
| Name | Grading Option | Score | 
|---|---|---|
| Carrera | GRD | 98 | 
| Panamera | PNP | 86 | 
| Taycan | PNP | 67.5 | 
| Cayenne | GRD | 45 | 
| Macan | GRD | 82 | 
| Cayman | GRD | 88 | 
| Boxster | PNP | 71 | 
gradebook.apply(pct_to_letter, 'Score')
array(['A+', 'B+', 'D+', 'F', 'B', 'B+', 'C'], dtype='<U2')
What if we want to factor in grading options?
def pct_to_letter_option(pct, option):
    # If the student is enrolled for a letter grade
    # call our function pct_to_letter
    if option == 'GRD':
        return pct_to_letter(pct)
    # Otherwise, check to see if they have at least a C-
    # (C here because our bins don't have a C-)
    else:
        if pct >= grade_bins['C']:
            return 'P'
        else:
            return 'NP'
gradebook.apply(pct_to_letter_option, 'Score', 'Grading Option')
array(['A+', 'P', 'NP', 'F', 'B', 'B+', 'P'], dtype='<U2')
gradebook = gradebook.with_columns(
    'Letter Grade', gradebook.apply(pct_to_letter_option, 'Score', 'Grading Option')
)
gradebook
| Name | Grading Option | Score | Letter Grade | 
|---|---|---|---|
| Carrera | GRD | 98 | A+ | 
| Panamera | PNP | 86 | P | 
| Taycan | PNP | 67.5 | NP | 
| Cayenne | GRD | 45 | F | 
| Macan | GRD | 82 | B | 
| Cayman | GRD | 88 | B+ | 
| Boxster | PNP | 71 | P | 
numbers = np.array([15, 14, -2, 1, 9])
numbers[[True, False, False, True, False]]
array([15, 1])
gradebook
| Name | Grading Option | Score | Letter Grade | 
|---|---|---|---|
| Carrera | GRD | 98 | A+ | 
| Panamera | PNP | 86 | P | 
| Taycan | PNP | 67.5 | NP | 
| Cayenne | GRD | 45 | F | 
| Macan | GRD | 82 | B | 
| Cayman | GRD | 88 | B+ | 
| Boxster | PNP | 71 | P | 
gradebook.where([True, True, False, True, True, False, False])
| Name | Grading Option | Score | Letter Grade | 
|---|---|---|---|
| Carrera | GRD | 98 | A+ | 
| Panamera | PNP | 86 | P | 
| Cayenne | GRD | 45 | F | 
| Macan | GRD | 82 | B | 
Run the following cell – ignore the lambda parts:
countries = Table.read_table('data/countries.csv')
countries = countries.relabeled('Country(or dependent territory)', 'Country') \
           .relabeled('% of world', '%') \
           .relabeled('Source(official or UN)', 'Source')
countries = countries.with_columns(
    'Country', countries.apply(lambda s: s[:s.index('[')].lower() if '[' in s else s.lower(), 'Country'),
    'Population', countries.apply(lambda i: int(i.replace(',', '')), 'Population'),
    '%', countries.apply(lambda f: float(f.replace('%', '')), '%')
)
countries
| Rank | Country | Population | % | Date | Source | 
|---|---|---|---|---|---|
| 1 | china | 1405936040 | 17.9 | 27 Dec 2020 | National population clock[3] | 
| 2 | india | 1371366679 | 17.5 | 27 Dec 2020 | National population clock[4] | 
| 3 | united states | 330888778 | 4.22 | 27 Dec 2020 | National population clock[5] | 
| 4 | indonesia | 269603400 | 3.44 | 1 Jul 2020 | National annual projection[6] | 
| 5 | pakistan | 220892331 | 2.82 | 1 Jul 2020 | UN Projection[2] | 
| 6 | brazil | 212523810 | 2.71 | 27 Dec 2020 | National population clock[7] | 
| 7 | nigeria | 206139587 | 2.63 | 1 Jul 2020 | UN Projection[2] | 
| 8 | bangladesh | 169885314 | 2.17 | 27 Dec 2020 | National population clock[8] | 
| 9 | russia | 146748590 | 1.87 | 1 Jan 2020 | National annual estimate[9] | 
| 10 | mexico | 127792286 | 1.63 | 1 Jul 2020 | National annual projection[10] | 
... (232 rows omitted)
def starts_or_ends_with_a(name):
    return name[0] == 'a' or name[-1] == 'a'
countries.apply(starts_or_ends_with_a, 'Country')
array([ True,  True, False,  True, False, False,  True, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False,  True, False,  True, False,  True,
        True,  True, False,  True,  True, False, False,  True, False,
       False,  True, False, False,  True,  True,  True, False,  True,
        True, False, False, False,  True, False, False,  True,  True,
       False, False, False,  True, False, False, False,  True, False,
       False,  True, False, False,  True,  True, False, False,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True, False,  True, False, False, False, False, False, False,
        True, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False,  True,  True,  True,
       False, False, False,  True, False, False, False, False, False,
       False,  True, False,  True, False, False, False,  True, False,
       False,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True, False,  True, False, False, False,
        True, False,  True, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False, False, False,  True, False,  True,  True, False,
       False, False, False,  True,  True, False, False,  True, False,
        True, False,  True, False,  True,  True, False, False, False,
        True, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False])
countries.where(countries.apply(starts_or_ends_with_a, 'Country'))
| Rank | Country | Population | % | Date | Source | 
|---|---|---|---|---|---|
| 1 | china | 1405936040 | 17.9 | 27 Dec 2020 | National population clock[3] | 
| 2 | india | 1371366679 | 17.5 | 27 Dec 2020 | National population clock[4] | 
| 4 | indonesia | 269603400 | 3.44 | 1 Jul 2020 | National annual projection[6] | 
| 7 | nigeria | 206139587 | 2.63 | 1 Jul 2020 | UN Projection[2] | 
| 9 | russia | 146748590 | 1.87 | 1 Jan 2020 | National annual estimate[9] | 
| 13 | ethiopia | 109612120 | 1.4 | 1 Jul 2020 | National annual projection[13] | 
| 23 | south africa | 59622350 | 0.761 | 1 Jul 2020 | National annual estimate[23] | 
| 25 | tanzania | 57637628 | 0.735 | 1 Jul 2020 | National annual projection[25] | 
| 27 | south korea | 51834302 | 0.661 | 1 Nov 2020 | Monthly national estimate[27] | 
| 28 | colombia | 50372424 | 0.643 | 30 Jun 2020 | National annual projection[28] | 
... (72 rows omitted)