import urllib.request # abilty to connect to web server & get HTML 
import bs4 # parses the HTML


url_to_read = "https://ada.ius.edu/~cjkimmer/teaching/i427.html"
response = urllib.request.urlopen(url_to_read) # request
html_document = response.read() # read response from server
response.close()


print(html_document)

b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n <head>\n  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n  <title>\n   Christopher J. Kimmer\n  </title>\n  <link href="../css/style.css" rel="stylesheet" type="text/css" />\n </head>\n <body>\n  <div id="wrapper">\n   <div id="content">\n    <div id="mainimg">\n     <div id="logo">\n      <h1>\n       Christopher J. Kimmer, Ph.D.\n      </h1>\n      <h2>\n       IU Southeast Informatics\n      </h2>\n     </div>\n     <h3>\n      iSciCL\n     </h3>\n     <h4>\n      Informatics and Scientific Computing Laboratory\n     </h4>\n    </div>\n    <div id="navheader">\n     <ul>\n      <li style="border-left: 0px">\n       <a href="../index.html">\n        Home\n       </a>\n      </li>\n      <li id="focused"> \n       <a href="./teaching.html">\n        Teaching\n       </a>\n      </li>\n      <li>\n       <a href="../research/research.html">\n        Research\n       </a>\n      </li>\n      <li>\n       <a href="../calendar.html">\n        Calendar\n       </a>\n      </li>\n      <li>\n       <a href="../links.html">\n        Links\n       </a>\n      </li>\n      <li style="width:124px;">\n       <a href="../contact.html">\n        Contact\n       </a>\n      </li>\n     </ul>\n    </div>\n    <div id="contentarea">\n     <div id="leftbar">\n      <!-- lemon tree -->\n      <h2>I427 Tentative Class Schedule \n      </h2>\n      <h3> Spring 2014</h3>\n      <p><em>\n       Tentative means that this schedule may change due to weather, classroom pace, etc.</em>\n      </p>\n      <br />\n      <table>\n\t<tr>\n\t  <td>Date\n\t  </td>\n\t  <td>\n\t\tClass Topic\n\t  <br/>\n\t\tReading, Assignments, etc.\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 1/13\n\t  </td>\n\t  <td>Syllabus, Information Retrieval, Intro to Python Interpreter\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 1/15\n\t  </td>\n\t  <td>Python Lists\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 1/20\n\t  </td>\n\t  <td> NO CLASS - MLK DAY HOLIDAY\n\t  </td>\n\t  <td>\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 1/22\n\t  </td>\n\t  <td>Python Dictionaries\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 1/27\n\t  </td>\n\t  <td>Serialization, List Comprehensions - HW 1 due\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 1/29\n\t  </td>\n\t  <td>IPython Setup, MySQLdb \n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 2/3\n\t  </td>\n\t  <td>No Class - Kimmer Sick\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 2/5\n\t  </td>\n\t  <td>Twitter API Introduction \n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 2/10\n\t  </td>\n\t  <td>Twitter API - What can we get? - HW 2 due\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 2/12\n\t  </td>\n\t  <td>In Class: Twitter wrapper Class - HW 3 due\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 2/17\n\t  </td>\n\t  <td>TwittIR - Term-Document Incidence &amp; Inverted Indices\n\t    <br/>\n\t    In Class 1 Hand-in\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 2/19\n\t  </td>\n\t  <td>In Class: Inverted Index \n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 2/24\n\t  </td>\n\t  <td>Ranking &amp; Stemming Tweets\n\t    <br/>\n\t    In Class 2 Hand-in\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 2/26\n\t  </td>\n\t  <td>How similar are two Twitter Users?\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 3/3\n\t  </td>\n\t  <td>...continued.\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 3/5\n\t  </td>\n\t  <td>Scraping Web Pages\n\t    <em>Read Chapter 5 in Mining the Social Web</em>\n\t    <strong>Project 1 Due</strong>\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 3/10\n\t  </td>\n\t  <td>Storing Web Pages in a Database\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 3/12\n\t  </td>\n\t  <td>In Class Scraping\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 3/17\n\t  </td>\n\t  <td>Crawling the Web - BFS\n\t  </td>\n\t</tr>\t\n\t<tr class="day2">\n\t  <td>W 3/19\n\t  </td>\n\t  <td>Crawling the Web - Real-World Issues\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 3/31\n\t  </td>\n\t  <td>Scoring &amp; Ranking Documents - tf, df, idf, tf-idf, and d\n\t  </td>\n\t</tr>\t \n\t<tr class="day2">\n\t  <td>W 4/2\n\t  </td>\n\t  <td>\n\t    The Vector Space Model\n\t    <br/>\n\t    <strong>HW 5 due Fri. 4/4 at 11 AM</strong>\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 4/7\n\t  </td>\n\t  <td>Test 1 - Python Practical\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 4/9\n\t  </td>\n\t  <td>Biwords and query processing\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 4/14\n      </td>\n      <td>Implementation Details and Catch-up, Simple Voting\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 4/16\n\t  </td>\n\t  <td>HITS <br/>\n\t    <strong>Project 2 due 11 AM</strong>\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 4/21\n\t  </td><td>\nPage Rank Part 1\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>W 4/23\n\t  </td>\n\t  <td>Finish Up Page Rank <br/>\n\t    <strong>HW 6 due 11 AM</strong>\n\t  </td>\n\t</tr>\n\t<tr class="day1">\n\t  <td>M 4/28\n\t  </td>\n\t  <td>Search Engine Presentations <br/>\n\t    <strong>HW 7 due 11 AM</strong>\n\t  </td>\n\t</tr>\n\t<tr class="day2">\n\t  <td>M 5/5\n\t  </td>\n\t  <td>11 AM Test 2 <br/>\n\t    <strong>Project 3 Due</strong>\n\t </td>\n\t</tr>\n      </table>\n      <br />\n      <!-- lemon tree -->\n     </div>\n     <div id="rightbar">\n      <h2 class="colortext">\n       Courses\n      </h2>\n      <p>\n       <a href="i101.html">\n        I101: Introduction to Informatics\n       </a>\n      </p>\n      <p>\n       <a href="i110.html">\n        I110: Programming Concepts\n       </a>\n      </p>\n      <p>\n       <a href="i111.html">\n        I111: Introduction to Databases\n       </a>\n      </p>\n      <p>\n       <a href="i300.html">\n        I300: Human Computer Interaction\n       </a>\n      </p>\n      <p>\n       <a href="i308.html">\n        I308: Information Representation\n       </a>\n      </p>\n      <p>\n       <a href="i427.html">\n        I427: Search Informatics\n       </a>\n      </p>\n      <p>\n       <a href="i441.html">\n        I441: Interaction Design Practice\n       </a>\n      </p>\n      <p>\n       <a href="i494.html">\n        I494: Capstone I\n       </a>\n      </p>\n      <p>\n       <a href="i495.html">\n        I495: Capstone II\n       </a>\n      </p>\n     </div>\n    </div>\n    <div id="bottom">\n     <div id="email">\n      <div id="emailtext">\n       cjkimmer -at- ius.edu\n      </div>\n     </div>\n     <div id="validtext">\n      <p>\n       Valid\n       <a href="http://validator.w3.org/check?uri=referer">XHTML</a>\n       |\n       <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a>\n      </p>\n     </div>\n    </div>\n   </div>\n  </div>\n </body>\n</html>\n'


type(html_document)

bytes


# now it's a real string
html_document.decode('utf-8')[:256] # just show the first few characters. we don't need all the output

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n <head>\n  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n  <title>\n'


# call the str() constructor instead, specifying the character set
str(html_document,'utf-8')[:256]

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n <head>\n  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n  <title>\n'


soup = bs4.BeautifulSoup(html_document, "lxml")


text_content = soup.get_text()
print(text_content) # this is something that we can tokenize & normalize before adding to index



   Christopher J. Kimmer
  

       Christopher J. Kimmer, Ph.D.
      

       IU Southeast Informatics
      

      iSciCL
     

      Informatics and Scientific Computing Laboratory
     

        Home
       

        Teaching
       

        Research
       

        Calendar
       

        Links
       

        Contact
       

I427 Tentative Class Schedule 
      
 Spring 2014

       Tentative means that this schedule may change due to weather, classroom pace, etc.


Date
	  

		Class Topic
	  
		Reading, Assignments, etc.
	  

M 1/13
	  
Syllabus, Information Retrieval, Intro to Python Interpreter
	  

W 1/15
	  
Python Lists
	  

M 1/20
	  
 NO CLASS - MLK DAY HOLIDAY
	  

W 1/22
	  
Python Dictionaries
	  

M 1/27
	  
Serialization, List Comprehensions - HW 1 due
	  

W 1/29
	  
IPython Setup, MySQLdb 
	  

M 2/3
	  
No Class - Kimmer Sick
	  

W 2/5
	  
Twitter API Introduction 
	  

M 2/10
	  
Twitter API - What can we get? - HW 2 due
	  

W 2/12
	  
In Class: Twitter wrapper Class - HW 3 due
	  

M 2/17
	  
TwittIR - Term-Document Incidence & Inverted Indices
	    
	    In Class 1 Hand-in
	  

W 2/19
	  
In Class: Inverted Index 
	  

M 2/24
	  
Ranking & Stemming Tweets
	    
	    In Class 2 Hand-in
	  

W 2/26
	  
How similar are two Twitter Users?
	  

M 3/3
	  
...continued.
	  

W 3/5
	  
Scraping Web Pages
	    Read Chapter 5 in Mining the Social Web
Project 1 Due


M 3/10
	  
Storing Web Pages in a Database
	  

W 3/12
	  
In Class Scraping
	  

M 3/17
	  
Crawling the Web - BFS
	  

W 3/19
	  
Crawling the Web - Real-World Issues
	  

M 3/31
	  
Scoring & Ranking Documents - tf, df, idf, tf-idf, and d
	  

W 4/2
	  

	    The Vector Space Model
	    
HW 5 due Fri. 4/4 at 11 AM


M 4/7
	  
Test 1 - Python Practical
	  

W 4/9
	  
Biwords and query processing
	  

M 4/14
      
Implementation Details and Catch-up, Simple Voting
	  

W 4/16
	  
HITS 
Project 2 due 11 AM


M 4/21
	  
Page Rank Part 1
	  

W 4/23
	  
Finish Up Page Rank 
HW 6 due 11 AM


M 4/28
	  
Search Engine Presentations 
HW 7 due 11 AM


M 5/5
	  
11 AM Test 2 
Project 3 Due


       Courses
      

        I101: Introduction to Informatics
       

        I110: Programming Concepts
       

        I111: Introduction to Databases
       

        I300: Human Computer Interaction
       

        I308: Information Representation
       

        I427: Search Informatics
       

        I441: Interaction Design Practice
       

        I494: Capstone I
       

        I495: Capstone II
       

       cjkimmer -at- ius.edu
      

       Valid
       XHTML
       |
       CSS

Importing Python modules¶

Reading a webpage given its URL¶

Byte strings¶

Details of the import statement¶

Removing markup to get text content¶

Your task for today / Quiz Question¶

From one webpage to a corpus¶