# htmltext.py
# An HTML parser that captures *only* the text of the HTML.
#
# Author: Aseem Kishore
#
# No license -- free to use. Just give credit where it's due please. =)
from sgmllib import SGMLParser
from string import *
class HtmlTextParser(SGMLParser):
""" An HTML parser that captures *only* the text of the HTML. """
## Constructor ##
def __init__(self):
    """ Create a parser that starts with no captured text. """
    # The base parser has to be initialized before any of our own state.
    SGMLParser.__init__(self)
    # Count of unwanted tags we are currently nested inside.
    self.inside_unwanted_tags = 0
    # Pieces of text collected so far; get_text() joins them.
    self.text_list = []
## Public methods ##
def parse(self, html):
    """ Parse the given html.

    html -- the markup to parse; must be a str (str subclasses are
            accepted as well).

    The whole string is fed to the parser, which is then closed to
    signal that no more input will follow.

    Raises TypeError if html is not a str.
    """
    # Validate with an explicit raise rather than an assert: assert
    # statements are removed when Python runs with -O, which would
    # silently disable the check.
    if not isinstance(html, str):
        raise TypeError(
            "html must be a string, " + str(type(html)) + " given.")
    self.feed(html)
    self.close()
def get_text(self):
    """ Return the text that has been parsed so far.

    The collected pieces in self.text_list are joined with a single
    space between them; an empty list yields an empty string.
    """
    # Use the str method instead of the deprecated string.join() function
    # (removed in Python 3); a single space was that function's default
    # separator, so the result is unchanged.
    return " ".join(self.text_list)
## Overridden methods ##
def start_script(self, attributes):
""" Process a