Skip to content
First NLP Project
  • AI Chat
  • Code
  • Report
  • Spinner
    # Start coding here... 
    import numpy as np
    import pandas as pd
    import requests
    import re
    from bs4 import BeautifulSoup
    # Pages are fetched from <base_url><n>.htm for n = 1..100 and kept, parsed,
    # in web_data in that order (web_data[0] is page 1).
    base_url = "http://www.uh.edu/engines/epi"
    web_data = []
    
    # A single Session reuses the underlying connection for all requests to the
    # same host instead of opening a new one per page.
    with requests.Session() as session:
        for page_number in range(1, 101):
            url = base_url + str(page_number) + ".htm"
            # Without a timeout a single stalled connection would block forever.
            response = session.get(url, timeout=30)
            # NOTE(review): the HTTP status is not checked, so an error page
            # would be parsed and stored like any other page - confirm intended.
            # NOTE(review): no parser is named, so bs4 picks whichever one is
            # installed; pin one (e.g. "html.parser") once the pages are
            # confirmed to parse the same way with it.
            soup = BeautifulSoup(response.text)
            web_data.append(soup)
        
    # Inspect the first parsed page: the whole document, its title text, and
    # the <meta> tags found in its <head>.
    display(web_data[0])
    demo_data = web_data[0]
    title = demo_data.title.text
    # Previously assigned from an undefined name (`demo`), which raised
    # NameError; the <meta> tags are what is displayed below.
    meta_list = demo_data.head.find_all('meta')
    display(title)
    display(meta_list)
    # Per-page fields, index-aligned with web_data:
    #   titles       - text of each page's <title>
    #   meta_headers - the <meta> tags found in each page's <head>
    #   body         - text content of each page's <body>
    titles = []
    authors = []  # not populated in this section
    meta_headers = []
    body = []
    for page in web_data:
        titles.append(page.title.text)
        meta_headers.append(page.head.find_all('meta'))
        body.append(page.body.text)
    
    # Normalise titles: collapse every run of spaces, carriage returns and
    # newlines into one space, trim the ends, and lower-case the result.
    whitespace_run = re.compile(r'[ \r\n]+')
    titles = [whitespace_run.sub(' ', raw_title).strip().lower() for raw_title in titles]
    
    
    # Spot-check the first page's extracted fields.
    display(titles[0])
    display(meta_headers[0])
    display(body[0])