Gerardo Nicolas Vietri
# Install beautifulsoup4

!pip install beautifulsoup4

# Import required libraries
import numpy as np
import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents


# get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/List_of_cities_in_Ukraine#Table_of_cities"
table_class="wikitable sortable jquery-tablesorter"
response=requests.get(wikiurl)
print(response.status_code)

Request the HTML response using the URL: we send a GET request to the Wikipedia page whose table we want to scrape and store the HTML response in a variable. Not every website permits scraping, so we check the status code first; 200 means we can go ahead and download the page.
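As a small defensive variant (just a sketch; the timeout value is an arbitrary choice), we can let requests raise on any non-2xx response instead of checking the code by hand:

import requests

response = requests.get(wikiurl, timeout=10)
response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
html_text = response.text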

To do: draw a dendrogram.

# Unsupervised Learning in Python: hierarchical clustering with SciPy
# Given samples (the array of scores) and country_names:
# import matplotlib.pyplot as plt
# from scipy.cluster.hierarchy import linkage, dendrogram
# mergings = linkage(samples, method='complete')
# dendrogram(mergings,
#            labels=country_names,
#            leaf_rotation=90,
#            leaf_font_size=6)
# plt.show()
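A self-contained version of that snippet, with placeholder samples and country_names standing in for the course data (both placeholders are assumptions, not the original arrays):

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

samples = np.random.rand(5, 3)                  # placeholder score matrix
country_names = ['AA', 'BB', 'CC', 'DD', 'EE']  # placeholder labels

mergings = linkage(samples, method='complete')  # complete-linkage clustering
dendrogram(mergings, labels=country_names, leaf_rotation=90, leaf_font_size=6)
plt.show()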
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')
uatable=soup.find('table',{'class':"wikitable"})
tables = pd.read_html(str(uatable))  # read_html returns a list of DataFrames
print(type(tables))
print(tables[0])
# take the first (and only) table as our DataFrame
df = tables[0]
print(df.head())
print(type(df))

# Series.str.replace treats '[a]' as a regex character class by default,
# which is why the literal footnote markers were not being removed.
# Passing regex=False replaces the literal text instead.
df['City name'] = df['City name'].str.replace(' ', '_')
for marker in ['[a]', '[b]', '[c]', '[d]']:
    df['City name'] = df['City name'].str.replace(marker, '', regex=False)
cities = df['City name'].tolist()
cities[0:10]
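Equivalently, a single regex pass removes any one-letter footnote marker (a sketch; the pattern assumes the markers are always one lowercase letter in square brackets):

df['City name'] = df['City name'].str.replace(r'\[[a-z]\]', '', regex=True)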
df.shape
df.info()
print(df[df['Oblast']=='Ivano-Frankivsk'])


from bs4 import BeautifulSoup
import requests
wiki_url = 'https://en.wikipedia.org/wiki/List_of_cities_in_Ukraine#List_of_cities'
print('Fetching main wiki article: %s' % wiki_url)
page = requests.get(wiki_url).text
print('Done. Extracting table links..')
html = BeautifulSoup(page, 'html.parser')
table = html.find('table', 'wikitable')



links = table.findAll('a')
links_content = {}
list_of_links = []

# build an article URL for each cleaned city name
for name in cities:
    links_content[name] = 'https://en.wikipedia.org/wiki/' + name
    list_of_links.append('https://en.wikipedia.org/wiki/' + name)
print(list_of_links)
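An alternative that uses the anchors actually found in the table, rather than rebuilding URLs from names (a sketch; it assumes the table's links carry relative hrefs like /wiki/Kyiv):

base = 'https://en.wikipedia.org'
hrefs = [base + a['href'] for a in table.find_all('a', href=True)]
print(hrefs[:5])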
list_of_links_test = list_of_links[:10]
city_data = []
list_dataframes = []
for link in list_of_links_test:
    response1 = requests.get(link)
    # parse data from the html into a beautifulsoup object
    soup = BeautifulSoup(response1.text, 'html.parser')
    citytable = soup.find('table', {'class': "infobox ib-settlement vcard"})
    # use a fresh name here; reusing `cities` would clobber the list of city names
    tables = pd.read_html(str(citytable))
    city_data.append(tables[0])
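Since this loop fires one HTTP request per city, it is worth pausing between requests and guarding against articles with no settlement infobox (a sketch; the 1-second delay and the None check are my additions, not part of the original run):

import time

for link in list_of_links_test:
    response1 = requests.get(link, timeout=10)
    soup = BeautifulSoup(response1.text, 'html.parser')
    citytable = soup.find('table', {'class': "infobox ib-settlement vcard"})
    if citytable is not None:  # some articles may lack an infobox
        city_data.append(pd.read_html(str(citytable))[0])
    time.sleep(1)  # arbitrary politeness delay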

# Each infobox comes back as a two-column (key, value) table.
# Transpose each one so the keys become the column names and the values a single row.
# (The earlier header_row approach failed because the first *row* has only two
# elements, which cannot label the transposed frame's columns.)
transposed_dfs = []
for df in city_data:
    # use the key column as the index, then flip it into the header
    transposed_df = df.set_index(df.columns[0]).transpose()
    # Reset the index
    transposed_df.reset_index(drop=True, inplace=True)
    # Append the transposed DataFrame to the list
    transposed_dfs.append(transposed_df)



transposed_dfs[1]    

# Next step: stack the per-city DataFrames into a single combined_df
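A minimal sketch of that stacking step, assuming the per-city frames share enough column names for pandas to align them (columns missing from a city simply become NaN):

combined_df = pd.concat(transposed_dfs, ignore_index=True, sort=False)
print(combined_df.shape)
combined_df.head()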


# before appending we need to delete the first 7 rows, standardize and fix the number of columns, then transpose, then stack the DataFrames
# here we have a list of DataFrames where each element should be one city's table
# for i in range(len(city_data)):
#     city_data[i].columns = city_data[i].iloc[0]
#     city_data[i] = city_data[i].iloc[1:]

# #Step 1: identify common columns
# common_columns = set(list_dataframes[0].columns)
# for df in list_dataframes[1:]:
#     common_columns &= set(df.columns)
# common_columns
    
    
    
    
#convert list(s) to dataframe
# testdf=pd.concat(city_data, axis=1).iloc[7:]
# testdf.reset_index(drop=True, inplace=True)
# testdf
# transpose=testdf.transpose()
# transpose.columns=transpose.iloc[0]
# #remove first row from DataFrame
# transpose = transpose[1:]
# transpose
# print(len(city_data))
# ukraine=pd.DataFrame(city_data)
# ukraine.transpose()
# data1=city_data[0].iloc[7:].transpose()
# data1.columns=data1.iloc[0]
# data1=data1[1:]
# data1.reset_index(inplace=True)
# data1 = data1.rename(columns = {'index':'City'})
# data2=city_data[1].transpose()

# #just get the coordinates
# data1.iloc[:,1]=data1.iloc[:,1].str[-22:]

# #print(data1.iloc[:,1])
# # data1.columns
# # testdf=data1
# # data1.columns
# new_columns=['City',
#        'Coordinates',
#        'Country', 'Municipality', 'Founded', 'Named for', 'City council',
#        'Raions', 'Government', 'Mayor',
#        'Area', 'City Area', 'Elevation',
#        'Population',
#        'Population_census', 'Rank', 'Density',
#        'Metro', 'Demonym(s)', 'Gross Regional Product', 'PBI',
#        'Per capita', 'Time zone', 'Summer (DST)', 'Postal code',
#        'Area code', 'Vehicle registration plate', 'FIPS code', 'Website']
# #rename columns manually
# print(data1.columns)
# #drop columns
# #data1.drop(data1.columns[[2,3,5,6,8,10,13,19]],axis=1, inplace=True)
# data1

OK, so I got the data we wanted from Wikipedia, but it is really untidy: the combined DataFrame looks like a diagonal matrix (1s down the diagonal, 0s everywhere else), since each city's values occupy their own set of columns and every other cell is NaN.

make it tidy

  1. Each column is a variable.
  2. Each row is an observation; each row in this case is a city, that's clear.

So we need to define the variables and fix the number of columns. I think we can keep it simple for now and add more features later.

Target cities table: id, name, name_in_latin_characters, city/village, Oblast_region, year_founded, population, area, distance_to_capital (from coordinates)
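For the distance_to_capital column, the great-circle (haversine) distance from each city's coordinates to Kyiv would do; a sketch, assuming the coordinates have already been parsed into decimal degrees (the Kharkiv figures below are just an example input):

import numpy as np

def haversine_km(lat1, lon1, lat2, lon2):
    # great-circle distance between two (lat, lon) points, in km
    r = 6371.0  # mean Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * r * np.arcsin(np.sqrt(a))

KYIV_LAT, KYIV_LON = 50.4501, 30.5234
print(haversine_km(49.9935, 36.2304, KYIV_LAT, KYIV_LON))  # Kharkiv -> Kyiv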

# quick check: DataFrame mapping each city name to its article URL
new_df = pd.DataFrame.from_dict(links_content, orient='index', columns=['url'])
new_df


