Get The Html Under A Tag Using Htmlparser Python
I want to get the whole HTML under a tag using HTMLParser. I am currently able to get the data between the tags, and the following is my code: class LinksParser(HTMLParser): def __init…
Solution 1:
One could use xml.etree.ElementTree.TreeBuilder
to exploit the etree API for finding/manipulating the <span>
element:
import sys
from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree
class LinksParser(HTMLParser):
    """Build an ElementTree from HTML by forwarding parser events to a TreeBuilder.

    Every start tag, end tag and text chunk reported by ``HTMLParser`` is
    passed to an ``etree.TreeBuilder``; ``close()`` returns the root element
    of the resulting tree so the etree API (``find``, ``tostring``, ...) can
    be used on it.

    NOTE: an element is only closed when ``handle_endtag`` fires, so the
    input is expected to close every tag it opens (e.g. ``<br />`` rather
    than a bare ``<br>``).
    """

    def __init__(self):
        # Explicit base-class call (instead of super()) matches the
        # ``from HTMLParser import HTMLParser`` import used by this snippet.
        HTMLParser.__init__(self)
        self.tb = etree.TreeBuilder()

    def handle_starttag(self, tag, attributes):
        """Open element *tag* with *attributes* (a list of (name, value) pairs)."""
        # Valueless attributes (e.g. ``<input disabled>``) arrive with a value
        # of None; store "" instead so the tree can still be serialized.
        attrib = dict(
            (name, "" if value is None else value) for name, value in attributes
        )
        self.tb.start(tag, attrib)

    def handle_endtag(self, tag):
        """Close the currently open element *tag*."""
        self.tb.end(tag)

    def handle_data(self, data):
        """Append text *data* to the element currently being built."""
        self.tb.data(data)

    def close(self):
        """Finish parsing and return the root element of the built tree."""
        # Flush anything still buffered in the HTML parser before asking the
        # builder for the finished tree.
        HTMLParser.close(self)
        return self.tb.close()
# Parse everything arriving on standard input into an element tree.
parser = LinksParser()
html_source = sys.stdin.read()
parser.feed(html_source)
root = parser.close()

# Locate the <span itemprop="description"> element anywhere below the root
# and serialize it, including the <span> tag itself, to standard output.
span = root.find(".//span[@itemprop='description']")
span_tree = etree.ElementTree(span)
span_tree.write(sys.stdout)
Output
<span itemprop="description"><h1>My First Heading</h1><p>My first <br /><br />paragraph.</p></span>
To print without the parent (root) <span>
tag:
# Emit only the contents of the <span>: its leading text first, then each
# child element serialized in document order.
# span.text is None when the element starts directly with a child tag (no
# leading text), and writing None would raise TypeError, so fall back to "".
sys.stdout.write(span.text or "")
for child in span:
    sys.stdout.write(etree.tostring(child))  # add encoding="unicode" on Python 3
Solution 2:
Here's something that gets the job done based on the test data you provided with minimal changes to your existing code (assuming it's basically doing what you want already). You'd probably want to expand it to deal with self-closing tags in a more robust way:
from HTMLParser import HTMLParser
class LinksParser(HTMLParser):
    """Collect the inner HTML of ``<span itemprop="description">`` into ``data``.

    Attributes:
        recording: nesting depth of ``<span>`` elements inside the target
            span; 0 means the parser is currently outside the target.
        data: the markup and text gathered so far from inside the target.
        self_closing_tags: tag names written as ``<tag/>`` with no end tag.
    """

    def __init__(self):
        # Explicit base-class call (instead of super()) matches the
        # ``from HTMLParser import HTMLParser`` import used by this snippet.
        HTMLParser.__init__(self)
        self.recording = 0
        self.data = ''
        self.self_closing_tags = ("br",)

    def _format_starttag(self, tag, attributes, closer):
        """Return ``<tag attr="value" ...`` followed by *closer* (``>`` or ``/>``)."""
        parts = ["<%s" % (tag,)]
        for name, value in attributes:
            if value is None:
                # Valueless attribute such as ``disabled``: emit the bare name
                # rather than the literal text "None".
                parts.append(" %s" % (name,))
            else:
                # Re-escape the characters that would otherwise break out of
                # the double-quoted attribute value.
                escaped = value.replace("&", "&amp;").replace('"', "&quot;")
                parts.append(' %s="%s"' % (name, escaped))
        parts.append(closer)
        return "".join(parts)

    def handle_starttag(self, tag, attributes):
        """Start recording at the target span; copy start tags seen inside it."""
        if tag == 'span':
            if self.recording:
                # A span nested inside the target: track the depth so its
                # end tag does not stop recording, and keep its markup.
                self.recording += 1
                self.data += self._format_starttag(tag, attributes, ">")
            elif any(name == 'itemprop' and value == 'description'
                     for name, value in attributes):
                # The target's own tag is not part of the output.
                self.recording = 1
            return
        if not self.recording:
            # Markup outside the target span is not part of the result.
            return
        if tag in self.self_closing_tags:
            # Written in full here so ``<br>`` and ``<br/>`` produce the same
            # output; the matching end event (if any) is ignored.
            self.data += self._format_starttag(tag, attributes, "/>")
        else:
            self.data += self._format_starttag(tag, attributes, ">")

    def handle_endtag(self, tag):
        """Copy end tags seen inside the target; stop when the target closes."""
        if not self.recording:
            return
        if tag == 'span':
            self.recording -= 1
            if self.recording:
                # Still inside the target, so this closed a nested span.
                self.data += "</span>"
            return
        if tag in self.self_closing_tags:
            # Already emitted as ``<tag/>`` by handle_starttag.
            return
        self.data += "</%s>" % (tag,)

    def handle_data(self, data):
        """Append text found inside the target span, as delivered by the parser."""
        if self.recording:
            self.data += data
Given this as input:
<span itemprop="description"><h1>My First Heading</h1><p>My first <br/><br/>paragraph.</p></span>
the output is:
<h1>My First Heading</h1><p>My first <br/><br/>paragraph.</p>
Post a Comment for "Get The Html Under A Tag Using Htmlparser Python"