# Report the interpreter and library versions this notebook was run with.
print('This notebook was prepared using:')

import sys
import os
print(f'python version {sys.version}')

import numpy as np
print(f'numpy version {np.__version__}')

# Collect the keyword arguments in one dict, then unpack them into the call.
# `my_function` is not defined in the visible part of this file.
my_params = {'my_number': 2, 'x': 7, 'filename': 'myfile.dat'}
my_function(**my_params)

import my_code

# All settings for the run are gathered here, ahead of the calls that use them.
my_parameter = 7
raw_path, processed_path, output_path = (
    'files/raw/my_data.csv',
    'files/processed/processed_data.csv',
    'files/analysis/my_results.csv',
)

# Each step is handed the previous step's output path as its input path.
# NOTE(review): what each `my_code` function reads or writes is not visible
# from this file -- confirm against the `my_code` module.
my_code.process_data(
    in_file=raw_path,
    out_file=processed_path,
    setting=my_parameter,
)
my_code.generate_results(
    in_file=processed_path,
    out_file=output_path,
)
my_code.plot_results(in_file=output_path)

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import seaborn as sns

# Load seaborn's 'iris' example data set and preview the first ten rows.
df = sns.load_dataset('iris')
df.head(n=10)

# Column labels.
df.columns

# Number of rows.
df.shape[0]

# Joint distribution of two of the measurements (the trailing `;` suppresses
# the notebook's echo of the returned object).
sns.jointplot(x='petal_length', y='sepal_length', data=df);

# Distinct values found in the `species` column.
print(np.unique(df['species']))

# One data frame per species, selected with a boolean mask on `species`.
df_setosa = df.loc[df['species'] == 'setosa']
df_versicolor = df.loc[df['species'] == 'versicolor']
df_virginica = df.loc[df['species'] == 'virginica']

# Same joint plot as for the full data, restricted to the setosa rows.
sns.jointplot(x='petal_length', y='sepal_length', data=df_setosa);

# Pairwise plots of the columns, coloured by species.
sns.pairplot(hue='species', data=df);

# Replace `df` with seaborn's 'planets' example data set and preview it.
df = sns.load_dataset('planets')
df.head(n=10)

# `year` against `distance`; `plot` is this file's alias for matplotlib.pyplot,
# and the y axis is switched to a log scale after the figure is drawn.
sns.jointplot(x='year', y='distance', data=df)
plot.yscale('log')

# Row counts per `year`, split by `method`.
sns.catplot(x='year', hue='method', kind='count', aspect=4, data=df);

import pandas as pd

# Fetch the TB table and preview it.
df = pd.read_csv('https://raw.githubusercontent.com/hadley/tidy-data/master/data/tb.csv')
df.head(n=10)

# Wide -> long: keep `iso2` and `year` as identifiers; every other column
# becomes one row, with its name in `sex_and_age` and its value in `cases`.
df = df.melt(
    id_vars=['iso2', 'year'],
    var_name='sex_and_age',
    value_name='cases',
)
df.head(n=10)

# Parse the former column names to extract sex, age lower and upper bounds.
# The pattern is a raw string so that `\D` and `\d` reach the regex engine
# as written; in a plain string literal they are invalid escape sequences,
# which Python warns about.
# NOTE(review): the pattern requires one non-digit followed by at least three
# digits; values without such a run yield NaN in every extracted column --
# confirm that this is intended.
tmp_df = df['sex_and_age'].str.extract(r'(\D)(\d+)(\d{2})', expand=True)

# Name the three extracted columns (one per capture group)
tmp_df.columns = ['sex', 'age_lower', 'age_upper']

# Create a single `age` column using `age_lower` and `age_upper`
tmp_df['age'] = tmp_df['age_lower'] + '-' + tmp_df['age_upper']

# Append the parsed columns to the original frame, aligned on the row index
df = pd.concat([df, tmp_df], axis=1)

df.head(10)

# Tidy-up, written as one method chain:
#   1. drop the helper columns that have been folded into `sex` / `age`,
#   2. drop rows that contain missing values,
#   3. rename `iso2` to `country` (`index=str` also passes every row label
#      through `str`),
#   4. sort by country, year, sex and age in ascending order.
df = (
    df.drop(columns=['sex_and_age', 'age_lower', 'age_upper'])
    .dropna()
    .rename(index=str, columns={'iso2': 'country'})
    .sort_values(by=['country', 'year', 'sex', 'age'], ascending=True)
)
df.head(n=10)

Data analysis principles and `pandas`

Questions about data

Project

A case study

What does it mean?

A better way: tidy data

Tidy data

Best practices for data analysis

Jupyter notebook example

More guidelines

(Highly condensed) Jupyter notebook example

Comma-separated values (CSV)

`pandas`

Example: the `iris` dataset in `seaborn`

Basic data frame checks

Showing the relationships between variables

Selecting subsets

Visualizing multiple variables at once

A quick test on another data set

Discovery methods vs. time

Example: Tidying the TB dataset

'Melting' the data set

Parsing the data

Reformatting and generating useful variables

Project

For next time

Data analysis principles and pandas

Questions about data

Project

A case study

What does it mean?

A better way: tidy data

Tidy data

Best practices for data analysis

Jupyter notebook example

More guidelines

(Highly condensed) Jupyter notebook example

Comma-separated values (CSV)

pandas

Example: the iris dataset in seaborn

Basic data frame checks

Showing the relationships between variables

Selecting subsets

Visualizing multiple variables at once

A quick test on another data set

How is the year the planet was discovered related to its distance from the solar system?

Discovery methods vs. time

Example: Tidying the TB dataset

'Melting' the data set

Parsing the data

Reformatting and generating useful variables

Project

For next time

Data analysis principles and `pandas`

`pandas`

Example: the `iris` dataset in `seaborn`