Beta
Table of Contents
The outline of your notebook will show up here. You can include headings in any text cell by starting a line with #, ##, ###, etc., depending on the desired title hierarchy.
Web scraping in Python
from scrapy import Selector
import requests
Introduction to HTML
# Minimal sample HTML document: a head holding a title, and a body holding
# two paragraph elements.
html = """
<html>
<head>
<title>Intro HTML</title>
</head>
<body>
<p>Hello World!</p>
<p>Enjoy Datacamp!</p>
</body>
</html>
"""
# Example XPath strings. The sample document above contains no div or span
# elements, so these only illustrate the notation.
# Absolute path: p children of the second div child of body.
xpath = "/html/body/div[2]/p"
# Any span element whose class attribute is exactly "span-class".
xpath2 = '//span[@class="span-class"]'
Xpaths and selectors
# XPath matching every direct child element of body, whatever its tag
xpath = "/html/body/*"
# Print out how many elements the XPath selects.
# NOTE(review): how_many_elements is not defined in the visible source;
# presumably it is supplied by the notebook environment - confirm.
how_many_elements(xpath)
<html>
<body>
<div>
<p>Hello World!</p>
<div>
<p>Choose DataCamp!</p>
</div>
</div>
<div>
<p>Thanks for Watching!</p>
</div>
</body>
</html>
# NOTE(review): print_element_text is not defined in the visible source;
# presumably it is supplied by the notebook environment - confirm.

# Absolute XPath to the paragraph nested as html > body > div > div > p
xpath = "/html/body/div/div/p"
# Print out the element text
print_element_text(xpath)

# XPath to the p children of whichever element has id "div3"
xpath = '//*[@id="div3"]/p'
# Print out the selection text
print_element_text(xpath)

# XPath to p elements whose class attribute is exactly "class-1 class-2"
xpath = '//p[@class="class-1 class-2"]'
# Print out the selected text
print_element_text(xpath)
# NOTE(review): print_attribute, how_many_elements and preview are not defined
# in the visible source; presumably supplied by the notebook environment.

# XPath to the href attribute of a elements directly inside the p with id "p2"
xpath = '//p[@id="p2"]/a/@href'
# Print out the selection(s); there should be only one
print_attribute(xpath)

# XPath to the href attribute of every a element whose href contains
# the substring "package-snippet"
xpath = '//a[contains(@href,"package-snippet")]/@href'
# Print out how many elements are selected
how_many_elements(xpath)
# Preview the selected elements
preview(xpath)
# Create a Selector selecting html as the HTML document.
# This must happen before any use of sel: the chained call below previously
# ran first and raised NameError because sel had not been assigned yet.
sel = Selector(text=html)
# Create a SelectorList of all div elements in the HTML document
divs = sel.xpath("//div")
# Chaining example: the second xpath starts with "./" so it is evaluated
# relative to each div selected by the first call. The result is not stored.
sel.xpath('//div').xpath('./span/p[3]')
# Download the page and keep the response body as the HTML source.
# NOTE(review): url is not assigned anywhere in the visible source;
# presumably it is supplied by the notebook environment - confirm.
html = requests.get(url).content
# Build the Selector object sel over the downloaded HTML
sel = Selector(text=html)
# Show the expected element count next to the number actually selected;
# '//*' matches every element in the document.
print("There are 1020 elements in the HTML document.")
print("You have found: ", len(sel.xpath("//*")))
CSS Locators
# First pair: the same selection written as XPath and as a CSS Locator.
# a elements anywhere below the first span child of body.
xpath = "/html/body/span[1]//a"
css_locator = "html > body > span:nth-of-type(1) a"

# Second pair: h4 elements anywhere below a span child of the div with
# id "uid", again as XPath and then as the equivalent CSS Locator.
xpath = '//div[@id="uid"]/span//h4'
css_locator = "div#uid > span h4"
# Build a selector from the html (of a secret website)
sel = Selector(text=html)
# CSS Locator for a elements that are direct children of div elements
# belonging to the class "course-block"
css_locator = "div.course-block > a"
# Print the number of selected elements.
# NOTE(review): how_many_elements is not defined in the visible source;
# presumably it is supplied by the notebook environment - confirm.
how_many_elements(css_locator)
# CSS Locator for all direct children of the element whose id is "uid"
css_locator = "#uid > *"
# Build a selector object from the html (of a secret website)
sel = Selector(text=html)
# All hyperlinks that are direct children of div elements in the class
# "course-block"
course_as = sel.css("div.course-block > a")
# The href attributes of those hyperlinks, chaining with a CSS Locator
hrefs_from_css = course_as.css("::attr(href)")
# The same href attributes, chaining with a relative XPath instead
hrefs_from_xpath = course_as.xpath("./@href")
# NOTE(review): print_results is not defined in the visible source;
# presumably it is supplied by the notebook environment - confirm.

# Text directly inside the p element with id "p3" (single slash / no space
# before ::text), as an XPath string and as a CSS Locator string.
xpath = '//p[@id="p3"]/text()'
css_locator = "p#p3::text"
# Print the text from our selections
print_results(xpath, css_locator)

# Text inside the p element with id "p3" and inside all of its descendants
# (double slash / space before ::text), as XPath and as CSS Locator.
xpath = '//p[@id="p3"]//text()'
css_locator = "p#p3 ::text"
# Print the text from our selections
print_results(xpath, css_locator)