Intermediate Importing Data in Python
Importing the packages and datasets
# Importing the course packages
import json
import pandas as pd
# Read the Twitter data
tweets_data = []
tweets_file = open("tweets.txt", "r")
for line in tweets_file:
    tweet = json.loads(line)
    tweets_data.append(tweet)
tweets_file.close()
# Import the other two datasets
wine = pd.read_csv("datasets/winequality-red.csv", sep=";")
latitude = pd.read_excel("datasets/latitude.xls")
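- A quick sanity check on the loaded objects (a minimal sketch, assuming the files above exist at those paths):
# Confirm the three datasets loaded as expected
print(len(tweets_data))   # number of tweets read from tweets.txt
print(wine.head())        # first rows of the wine-quality data
print(latitude.head())    # first rows of the latitude spreadsheet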
1.1.1 Importing flat files from the web
- I'm about to import my first file from the web!
- The flat file I'll import is 'winequality-red.csv' from the University of California, Irvine's Machine Learning Repository.
- The flat file contains tabular data on the physicochemical properties of red wine, such as pH, alcohol content, and citric acid content, along with a wine quality rating.
# Import package
from urllib.request import urlretrieve
# Import pandas
import pandas as pd
# Assign url of file: url
url = 'https://assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'
# Save file locally
urlretrieve(url, 'winequality-red.csv')
# Read file into a DataFrame and print its head
df = pd.read_csv('winequality-red.csv', sep=';')
print(df.head())
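- urlretrieve() also returns the local filename and the response headers, which can be handy for a quick check (a small sketch, reusing the url defined above):
# urlretrieve() returns a (filename, headers) tuple
local_file, headers = urlretrieve(url, 'winequality-red.csv')
print(local_file)                    # 'winequality-red.csv'
print(headers.get('Content-Type'))  # the server-reported content type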
1.1.2 Opening and reading flat files from the web
- So far: imported a file from the web, saved it locally, and loaded it into a DataFrame.
- To load a file from the web into a DataFrame without first saving it locally, use pd.read_csv() with the URL as the first argument and the separator sep as the second argument.
# Import packages
import matplotlib.pyplot as plt
import pandas as pd
# Assign url of file: url
url = 'https://assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'
# Read file into a DataFrame: df
df = pd.read_csv(url, sep=';')
# Print the head of the DataFrame
print(df.head())
# Plot first column of df
df.iloc[:, 0].hist()
plt.title('Histogram of fixed acidity in the red wine data')
plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)')
plt.ylabel('count')
plt.show()
1.1.3 Importing non-flat files from the web
- Use pd.read_excel() to import an Excel spreadsheet; passing sheet_name=None reads all sheets into a dict of DataFrames keyed by sheet name.
# Import package
import pandas as pd
# Assign url of file: url
url = 'https://assets.datacamp.com/course/importing_data_into_r/latitude.xls'
# Read in all sheets of Excel file: xls
xls = pd.read_excel(url, sheet_name=None)
# Print the sheetnames to the shell
print(xls.keys())
# Print the head of the first sheet (using its name, NOT its index)
print(xls['1700'].head())
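- Because sheet_name=None returns a dict mapping sheet names to DataFrames, you can loop over every sheet in the workbook (a minimal sketch using the xls object above):
# Iterate over all sheets: the dict keys are sheet names, the values are DataFrames
for name, sheet in xls.items():
    print(name, sheet.shape)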
1.2.1 Performing HTTP requests in Python using urllib
# Import packages
from urllib.request import urlopen, Request
# Specify the url
url = "https://campus.datacamp.com/courses/1606/4135?ex=2"
# This packages the request: request
request = Request(url)
# Sends the request and catches the response: response
response = urlopen(request)
# Print the datatype of response
print(type(response))
# Be polite and close the response!
response.close()
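- An alternative that closes the response for you: urlopen() supports the with statement, so the connection is released even if an error occurs (a sketch equivalent to the code above, reusing the same imports and url):
# The with-block closes the response automatically
with urlopen(Request(url)) as response:
    print(type(response))  # <class 'http.client.HTTPResponse'>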
1.2.2 Printing HTTP request results in Python using urllib
# Import packages
from urllib.request import urlopen, Request
# Specify the url
url = "https://campus.datacamp.com/courses/1606/4135?ex=2"
# This packages the request
request = Request(url)
# Sends the request and catches the response: response
response = urlopen(request)
# Extract the response: html
html = response.read()
# Print the html
print(html)
# Be polite and close the response!
response.close()
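- Note that response.read() returns raw bytes, not a str; to work with the markup as text, decode it (a small sketch, assuming the page is UTF-8 encoded):
# Decode the raw bytes into a string (assumes UTF-8)
html_str = html.decode('utf-8')
print(html_str[:200])  # first 200 characters of the page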
1.2.3 Performing HTTP requests in Python using requests
# Import package
import requests
# Specify the url: url
url = "http://www.datacamp.com/teach/documentation"
# Package the request, send the request and catch the response: r
r = requests.get(url)
# Extract the response: text
text = r.text
# Print the html
print(text)
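- Before using r.text it's worth confirming the request succeeded; requests exposes the status code and a helper that raises on error (a minimal sketch):
# 200 means the request succeeded
print(r.status_code)
# Raises requests.HTTPError for 4xx/5xx responses
r.raise_for_status()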
1.3.1 Parsing HTML with BeautifulSoup
- Use the BeautifulSoup package to parse, prettify and extract information from HTML.
- Scrape the data from the webpage of Guido van Rossum, Python's very own Benevolent Dictator for Life.
# Import packages
import requests
from bs4 import BeautifulSoup
# Specify url: url
url = 'https://www.python.org/~guido/'
# Package the request, send the request and catch the response: r
r = requests.get(url)
# Extracts the response as html: html_doc
html_doc = r.text
# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc, 'html.parser')
# Prettify the BeautifulSoup object: pretty_soup
pretty_soup = soup.prettify()
# Print the response
print(pretty_soup)
1.3.2 Turning a webpage into data using BeautifulSoup: getting the text
- Extract the text from the BDFL's webpage, along with printing the webpage's title.
# Import packages
import requests
from bs4 import BeautifulSoup
# Specify url: url
url = 'https://www.python.org/~guido/'
# Package the request, send the request and catch the response: r
r = requests.get(url)
# Extract the response as html: html_doc
html_doc = r.text
# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc, 'html.parser')
# Get the title of Guido's webpage: guido_title
guido_title = soup.title
# Print the title of Guido's webpage to the shell
print(guido_title)
# Get Guido's text: guido_text
guido_text = soup.get_text()
# Print Guido's text to the shell
print(guido_text)
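- soup.title returns the whole <title> element, tags included; if you only want the text inside it, .string strips the markup (a one-line sketch):
# .string returns just the text inside the <title> tag
print(guido_title.string)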
1.3.3 Turning a webpage into data using BeautifulSoup: getting the hyperlinks
- Extract the URLs of the hyperlinks from the BDFL's webpage
# Import packages
import requests
from bs4 import BeautifulSoup
# Specify url
url = 'https://www.python.org/~guido/'
# Package the request, send the request and catch the response: r
r = requests.get(url)
# Extracts the response as html: html_doc
html_doc = r.text
# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc, 'html.parser')
# Print the title of Guido's webpage
print(soup.title)
# Find all 'a' tags (which define hyperlinks): a_tags
a_tags = soup.find_all('a')
# Print the URLs to the shell
for link in a_tags:
    print(link.get('href'))
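- Many href values on a page are relative paths; to turn them into usable absolute URLs, resolve them against the page's URL with urljoin() (a small sketch, reusing url and a_tags from above):
from urllib.parse import urljoin

for link in a_tags:
    href = link.get('href')
    if href:  # some <a> tags carry no href attribute
        print(urljoin(url, href))  # resolve relative links against the page URL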