Ray Eichler
‌
‌
‌
‌
‌
‌
‌
‌
‌
‌
‌
‌
‌
‌
Sign up
Beta
Spinner
# Start coding here... 
import numpy as np
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
# Episode pages are addressed as <base_url><n>.htm, with n running 1..100.
base_url = "http://www.uh.edu/engines/epi"
web_data = []

# One session lets the 100 requests reuse a single connection.
with requests.Session() as session:
    for episode in range(1, 101):
        url = f"{base_url}{episode}.htm"
        # Without a timeout a single stalled response would hang the scrape.
        response = session.get(url, timeout=30)
        # Name the parser explicitly so the parse tree does not depend on
        # which optional parsers happen to be installed (and to silence
        # bs4's GuessedAtParserWarning).
        # NOTE(review): the HTTP status is not checked, so error pages are
        # parsed and stored like any other page -- confirm this is intended.
        web_data.append(BeautifulSoup(response.text, "html.parser"))
    
# Inspect the first scraped page to see what a parsed document looks like.
# `display` is not defined in this file; presumably it is the notebook
# (IPython) built-in -- confirm when running outside a notebook.
demo_data = web_data[0]
display(demo_data)

title = demo_data.title.text
# All <meta> tags found inside the page's <head>.
meta_list = demo_data.head.find_all('meta')

display(title)
display(meta_list)
# Parallel lists: index k in each list describes page web_data[k].
titles = []
authors = []  # not populated in this section
meta_headers = []
body = []

for page in web_data:
    # A page lacking <title>, <head> or <body> gets an empty placeholder so
    # the lists stay index-aligned instead of failing on a None attribute.
    titles.append(page.title.text if page.title else "")
    meta_headers.append(page.head.find_all('meta') if page.head else [])
    body.append(page.body.text if page.body else "")

# Collapse every run of carriage returns, newlines and spaces into a single
# space, then trim the ends and lowercase the result.
whitespace_run = re.compile(r'[\r\n ]+')
titles = [whitespace_run.sub(' ', raw_title).strip().lower() for raw_title in titles]


# Show the first entry of each list as a sanity check.
display(titles[0])
display(meta_headers[0])
display(body[0])
  • AI Chat
  • Code