Beta
Table of Contents
The outline of your notebook will show up here. You can include headings in any text cell by starting a line with #, ##, ###, etc., depending on the desired title hierarchy.
# Start coding here...
import numpy as np
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
# Common URL prefix of the pages to download; each page URL is this prefix
# followed by a number and ".htm".
base_url = "http://www.uh.edu/engines/epi"

# Download pages 1..100 and keep one parsed BeautifulSoup document per page,
# in page order (web_data[0] is page 1).
web_data = []
for page_number in range(1, 101):
    url = base_url + str(page_number) + ".htm"
    # Without a timeout a single stalled connection blocks the run forever.
    response = requests.get(url, timeout=30)
    # Name the parser explicitly: otherwise bs4 picks whichever parser is
    # installed, which makes the parse tree machine-dependent and triggers a
    # warning.
    web_data.append(BeautifulSoup(response.text, "html.parser"))

# Show the first parsed page as a quick sanity check.
display(web_data[0])
# Inspect the first parsed page: its <title> text and the <meta> tags found
# inside <head>.
demo_data = web_data[0]
title = demo_data.title.text
# The original assignment referenced an undefined name (`demo`) and raised
# NameError; bind the list of <meta> tags that is displayed below instead.
meta_list = demo_data.head.find_all('meta')
display(title)
display(meta_list)
# Per-page fields pulled out of the parsed documents. The lists are parallel:
# index k in each list refers to web_data[k].
titles = []
authors = []  # created here but not filled by the code in this section
meta_headers = []
body = []
for page in web_data:
    titles.append(page.title.text)
    meta_headers.append(page.head.find_all('meta'))
    body.append(page.body.text)

# Normalize titles: every run of carriage returns, newlines and spaces becomes
# a single space, then the result is trimmed and lower-cased. One character
# class gives the same result as substituting '\r+', '\n+' and ' +' in turn.
_title_gap = re.compile(r'[\r\n ]+')
titles = [_title_gap.sub(' ', raw_title).strip().lower() for raw_title in titles]

# Show the first entry of each list as a quick sanity check.
display(titles[0])
display(meta_headers[0])
display(body[0])