Workspace
Carlo D'Aloia/

Course Notes: Web Scraping in Python

0
Beta
Spinner

Web scraping in Python

from scrapy import Selector
import requests

Introduction to HTML

# A small HTML document stored as a string: a <head> holding a <title>,
# and a <body> holding two <p> paragraphs.
html = """
<html>
  <head>
    <title>Intro HTML</title>
  </head>
  <body>
    <p>Hello World!</p>
    <p>Enjoy Datacamp!</p>
  </body>
</html>
"""

# XPath syntax examples. Neither matches anything in the document above,
# which contains no <div> or <span> elements.
# - xpath:  the <p> children of the second <div> child of <body>
# - xpath2: every <span>, at any depth, whose class attribute is exactly
#           "span-class"
xpath = "/html/body/div[2]/p"
xpath2 = '//span[@class="span-class"]'

XPaths and selectors

# XPath matching every direct child element of <body>, whatever its tag
# ("*" is the wildcard for any element name).
xpath = "/html/body/*"

# Report how many elements the XPath selects.
# NOTE: how_many_elements is not defined in these notes; presumably it is
# supplied by the course environment.
how_many_elements(xpath)
<html>
  <body>
    <div>
      <p>Hello World!</p>
      <div>
        <p>Choose DataCamp!</p>
      </div>
    </div>
    <div>
      <p>Thanks for Watching!</p>
    </div>
  </body>
</html>

# XPath to a <p> that sits inside a <div> nested in another <div> under <body>.
# In the sample HTML shown just above, only the "Choose DataCamp!" paragraph
# has that nesting.
xpath = "/html/body/div/div/p"

# Show the text of the selected element.
# NOTE: print_element_text is not defined in these notes; presumably it is
# supplied by the course environment.
print_element_text(xpath)
# XPath to the <p> children of whichever element (any tag) has id="div3"
xpath = '//*[@id="div3"]/p'

# Show the text of the selection
print_element_text(xpath)
# XPath to <p> elements selected by class. The predicate compares the whole
# attribute string, so it only matches class="class-1 class-2" exactly
# (same names, same order, single space).
xpath = '//p[@class="class-1 class-2"]'

# Show the text of the selection
print_element_text(xpath)
# XPath to the href attribute of the <a> children of the <p> with id="p2".
# The trailing "/@href" selects the attribute itself rather than the element.
xpath = '//p[@id="p2"]/a/@href'

# Show the selection(s); exactly one result is expected
print_attribute(xpath)
# XPath to the href attributes of every <a> whose href value contains the
# substring "package-snippet"
xpath = '//a[contains(@href,"package-snippet")]/@href'

# Report how many nodes are selected, then preview them
how_many_elements(xpath)
preview(xpath)
# Create a Selector selecting html as the HTML document
sel = Selector(text=html)

# Create a SelectorList of all div elements in the HTML document
divs = sel.xpath("//div")

# Chaining example: the second xpath() call is evaluated relative to each
# <div> matched by the first one (the leading "./" starts from the current
# node) and picks the third <p> child of every <span> child.
# This call has to come after sel is created; its result is not stored.
sel.xpath("//div").xpath("./span/p[3]")

# Download the page and keep the response body as the HTML source.
# NOTE: url is not defined in these notes; presumably it is supplied by the
# course environment.
html = requests.get(url).content

# Build the Selector object sel from the downloaded source
sel = Selector(text=html)

# Print the expected element count next to the number of nodes matched by
# "//*" (every element in the document)
print("There are 1020 elements in the HTML document.")
print("You have found: ", len(sel.xpath("//*")))

CSS Locators

# XPath answer: every <a> at any depth inside the first <span> child of <body>
xpath = "/html/body/span[1]//a"

# CSS Locator answer: ">" is the direct-child step ("/" in XPath), a space is
# the any-depth step ("//"), and :nth-of-type(1) corresponds to "[1]"
css_locator = "html > body > span:nth-of-type(1) a"
# XPath answer: every <h4> at any depth inside a <span> child of the <div>
# with id="uid"
xpath = '//div[@id="uid"]/span//h4'

# CSS Locator answer: "div#uid" selects the <div> by id, ">" steps to its
# direct <span> children, and the space reaches <h4> at any depth
css_locator = "div#uid > span h4"
# Build a Selector over the page source held in html
# (the source of an undisclosed website, per the exercise)
sel = Selector(text=html)

# CSS Locator for <a> elements that are direct children of a <div> belonging
# to the class "course-block"
css_locator = "div.course-block > a"

# Report how many elements the locator selects
how_many_elements(css_locator)
# CSS Locator to all direct children of the element whose id is "uid":
# "#uid" selects by id, ">" restricts to direct children, "*" matches any tag.
css_locator = "#uid > *"
# Build a Selector over the page source held in html
# (the source of an undisclosed website, per the exercise)
sel = Selector(text=html)

# All hyperlinks (<a>) that are direct children of a <div> belonging to the
# class "course-block"
course_as = sel.css("div.course-block > a")

# Two ways of pulling the href attribute out of that selection by chaining:
# with the CSS pseudo-element ::attr(href) ...
hrefs_from_css = course_as.css("::attr(href)")

# ... and with an XPath relative to each selected <a>
hrefs_from_xpath = course_as.xpath("./@href")
# XPath to the text directly inside the <p> with id="p3": a single "/" before
# text() keeps only the element's own text nodes
xpath = '//p[@id="p3"]/text()'

# CSS Locator counterpart: ::text attached directly to the element (no space)
css_locator = "p#p3::text"

# Show the text obtained from both selections
print_results(xpath, css_locator)
# XPath to the text at any depth inside the <p> with id="p3": the double "//"
# before text() also reaches text nested in child elements
xpath = '//p[@id="p3"]//text()'

# CSS Locator counterpart: the space before ::text is the descendant step
css_locator = "p#p3 ::text"

# Show the text obtained from both selections
print_results(xpath, css_locator)