# htmltext.py # An HTML parser that captures *only* the text of the HTML. # # Author: Aseem Kishore # # No license -- free to use. Just give credit where it's due please. =) from sgmllib import SGMLParser from string import * class HtmlTextParser(SGMLParser): """ An HTML parser that captures *only* the text of the HTML. """ ## Constructor ## def __init__(self): """ Initialize a new HtmlTextParser. """ SGMLParser.__init__(self) self.text_list = [] self.inside_unwanted_tags = 0 # how many unwanted tags we're inside ## Public methods ## def parse(self, html): """ Parse the given html. """ assert type(html) is str,\ "html must be a string, " + str(type(html)) + " given." self.feed(html) self.close() def get_text(self): """ Return the text that has been parsed so far. """ return join(self.text_list) ## Overridden methods ## def start_script(self, attributes): """ Process a