Read CSV Files
The first step in any data science project is to import your data, and often it will be in Comma Separated Value (CSV) format. Use this template to reduce data cleaning tasks later in your notebook by importing CSV files efficiently. For example, you can select columns and handle null values and dates all in one function call!
Begin by uploading your CSVs to this workspace!
Decorator functions modify the behaviour of other functions. They do this through a 'closure': the decorator defines a nested function that keeps access to the original function and calls it with altered behaviour. In the decorator definition, it is the returned nested function ('return wrapper') that becomes the new function.
def divider(a, b):
    """Print the quotient of ``a`` divided by ``b``.

    The result is only printed; the function itself returns ``None``.
    """
    quotient = a / b
    message = "Dividing {} by {} results in: {}".format(a, b, quotient)
    print(message)
# Definition of a decorator function
def switcher(func):
    """Return a wrapper that calls ``func`` with its two arguments swapped.

    Args:
        func: A callable taking two positional arguments.

    Returns:
        A two-argument callable that prints a notice naming ``func`` and then
        returns ``func(b, a)``.
    """
    # Local import so this block does not rely on a file-level import.
    import functools

    # functools.wraps copies __name__/__doc__ from ``func`` onto the wrapper,
    # so the decorated function still identifies itself as the original.
    @functools.wraps(func)
    def wrapper(a, b):
        print("> This Function now has switched inputs: {}".format(func.__name__))
        return func(b, a)

    return wrapper
# A function can be modified after its definition: passing it to the decorator
# returns a new callable, while the original name keeps the original behaviour.
new_divider = switcher(divider)

divider(3, 4)      # original argument order
new_divider(3, 4)  # same call, but the wrapper hands the arguments over swapped
# Applying the decorator with '@' at definition time rebinds the name to the
# wrapper returned by switcher.
@switcher
def decimaler(a, b):
    """Multiplies first number by 10 and adds second number"""
    combined = 10 * a + b
    print("10 x {} + {} results in: {}".format(a, b, combined))


# The wrapper swaps the arguments, so the body above runs with a=8 and b=3.
decimaler(3, 8)
import time
def func_info(func):
    """A decorator that provides function information.

    Each call to the decorated function prints the wrapped function's name
    and docstring, runs it, and then prints how long the call took.

    Args:
        func: The callable to wrap; it may take any arguments.

    Returns:
        A wrapper accepting the same arguments as ``func`` and returning
        whatever ``func`` returns.
    """
    # Local import so this block does not rely on a file-level import.
    import functools

    # Keep the wrapped function's __name__/__doc__ visible on the wrapper.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print("Function name: {}".format(func.__name__))
        print("Function docstring: {}".format(func.__doc__))
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        print("Function runtime: {}".format(end - start))
        return result

    return wrapper
@func_info
def decimaler(a, b):
    """Multiplies first number by 10 and adds second number"""
    tens_plus_units = 10 * a + b
    print("10 x {} + {} results in: {}".format(a, b, tens_plus_units))


# The call goes through the func_info wrapper, which reports the name,
# docstring and runtime around the line printed by the body.
decimaler(5, 7)
Context Managers — defining a context manager with contextlib requires:
- The decorator @contextlib.contextmanager
- A function header (e.g. def xyz():)
- Set-up code that runs before the 'yield'
- A 'yield' statement (which may yield no value)
- Tear-down code that runs after control returns to the context manager
Use a context manager with a 'with' statement (add 'as' if the function yields a value)
Context managers can be nested, as seen in the timer/open combination below.
import time
import contextlib
import pandas as pd
# Path of the CSV file that is opened by the context-manager example in this cell
file = "data/sheffield_weather_station.csv"
@contextlib.contextmanager
def timer():
    """Time the body of a ``with`` block and print the elapsed seconds.

    Prints "Timer Started." on entry and "Time taken is <seconds>" on exit.
    Nothing is yielded, so no ``as`` target is needed.
    """
    # perf_counter is monotonic, so the interval is unaffected by clock changes.
    start = time.perf_counter()
    print("Timer Started.")
    try:
        yield
    finally:
        # Runs even when the with-body raises, so the elapsed time is always
        # reported; the exception then continues to propagate.
        finish = time.perf_counter()
        print("Time taken is {}".format(finish - start))
@contextlib.contextmanager
def read_only_file(str):
    """Open a file in read-only mode"""
    # The docstring above is printed at runtime by this notebook, so it is
    # left unchanged; details are documented here instead.
    #
    # str: path of the file to open. NOTE: the parameter name shadows the
    #      builtin ``str`` inside this function; it is kept so that existing
    #      callers (including keyword callers) keep working.
    # Yields the open text-mode file object.
    #
    # Delegating to open()'s own context manager guarantees the handle is
    # closed even if the body of the caller's ``with`` block raises.
    with open(str, "r") as handle:
        yield handle
# Apply two context managers at once: timer() and read_only_file()
with timer(), read_only_file(file) as f:
    line_count = 0  # stays 0 when the file has no lines
    for line_count, line in enumerate(f, start=1):
        if line.startswith("#"):
            # Echo lines that begin with '#'
            print(line)
    print("File line count: {}".format(line_count))

# Show the docstring of the context-manager function
print(read_only_file.__doc__)
Two context managers that open files for reading and writing, so the data can be processed one line at a time (this avoids a failure caused by loading an over-sized file into memory all at once)
import contextlib
# Input CSV that the copy loop in this cell reads line by line
file = "data/sheffield_weather_station.csv"
# Output file that receives every input line not starting with '#'
outfile = "out.csv"
@contextlib.contextmanager
def read_file(file):
    """Open a file in read mode, then close it.

    Args:
        file: Path of the file to open.

    Yields:
        The open text-mode file object.
    """
    # open()'s own context manager closes the handle even when the body of
    # the caller's ``with`` block raises.
    with open(file, "r") as f:
        yield f
@contextlib.contextmanager
def write_file(file):
    """Open a file in write mode, then close it.

    Args:
        file: Path of the file to open; mode "w" creates it or truncates
            existing content.

    Yields:
        The open text-mode file object.
    """
    # open()'s own context manager flushes and closes the handle even when
    # the body of the caller's ``with`` block raises.
    with open(file, "w") as f:
        yield f
# Stream the input one line at a time: lines starting with '#' are printed,
# every other line is written to the output file.
with read_file(file) as in_f, write_file(outfile) as out_f:
    for line in in_f:
        if line.startswith("#"):
            print(line)
            continue
        out_f.write(line)
import pandas as pd
# Load the weather-station file into a DataFrame, configuring parsing up front
df = pd.read_csv(
    "data/sheffield_weather_station.csv",  # Replace with your CSV file path
    # The following arguments are optional and can be removed:
    # If columns aren't separated by commas, indicate the delimiter here.
    # Raw string: "\s" is not a valid escape sequence in a normal string
    # literal and triggers a warning on recent Python versions.
    sep=r"\s+",
    # Indicate which zero-indexed row number(s) have the column names
    header=0,
    # List of column names to use (useful for renaming columns)
    names=["year", "month", "max_c", "min_c", "af", "rain", "sun"],
    # If not all columns are needed, indicate which you need (useful for lower memory usage)
    usecols=["year", "month", "max_c", "min_c", "rain", "sun"],
    # Indicate which column(s) to use as row labels
    index_col=["year", "month"],
    # Lines starting with this string should be ignored (useful if there are file comments)
    comment="#",
    # Indicate the number of lines to skip at the start of the file (also useful for file comments)
    skiprows=None,
    # Indicate string(s) that should be recognized as NaN/NA
    na_values=["---", "unknown", "no info"],
    # Indicate which column(s) are date column(s)
    parse_dates=False,
    # Indicate number of rows to read (useful for large files)
    nrows=500,
    # Encoding to use when reading file
    encoding="utf-8",
)
df.head(10)  # Preview the first 10 lines
# Start analyzing your DataFrame!
For more information on arguments, visit pandas' read_csv()
documentation.