- 博客(0)
- 资源 (3)
空空如也
Python 最牛的解析 HTML 的方法
class MyHTMLParser(HTMLParser.HTMLParser):
    """HTML parser that collects table-cell text, row/cell classes and styles.

    Results are accumulated in the module-level lists ``style_list``,
    ``br_list``, ``tr_list``, ``td_list`` and ``td_class_list``.  These are
    re-created (emptied) every time a parser is instantiated; the very same
    list objects are also exposed as attributes of the instance.

    Nesting of the tags named in ``selected`` is tracked in ``_level_stack``.
    A closing tag pops the stack only when it matches the innermost open tag,
    so a ``<meta>`` that is never closed stays on the stack, and so does the
    ``<head>`` around it.  The path comparisons in ``handle_data`` rely on
    this, which is why they contain ``html/head/meta/...``.
    """

    def __init__(self):
        """Initialise parser state and reset the shared result lists."""
        global style_list, br_list, tr_list, td_list, td_class_list

        # Run the base initializer instead of only reset(): it calls reset()
        # itself and also sets up the state feed() needs on Python 3.
        # An explicit base call (not super()) keeps Python 2 working too.
        HTMLParser.HTMLParser.__init__(self)

        # Only these tags take part in the nesting path.
        self.selected = ['html', 'body', 'table', 'head', 'meta', 'style', 'tr', 'td']
        self._level_stack = []
        # True while cell data/classes should be recorded; cleared by a
        # <tr class="SectionGap"> row (see handle_starttag).
        self.flag = True

        # Module-level lists are kept for existing callers.
        style_list, br_list, tr_list, td_list, td_class_list = [], [], [], [], []
        # Same objects, reachable without going through the globals.
        self.style_list = style_list
        self.br_list = br_list
        self.tr_list = tr_list
        self.td_list = td_list
        self.td_class_list = td_class_list

    def handle_starttag(self, tag, attrs):
        """Track nesting and record row / cell attribute information.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by the
        base parser.
        """
        if tag in self.selected:
            self._level_stack.append(tag)

        if tag == 'td':
            if self.flag:
                # Exactly one entry per recorded cell: its class, or '' when
                # the cell has no class attribute.  Other attributes are
                # ignored so the list stays aligned with the cells.
                td_class_list.append(dict(attrs).get('class', ''))
        elif tag == 'tr':
            # NOTE(review): the flag ends up reflecting the LAST attribute of
            # the row, and a row without attributes leaves the flag as it
            # was.  Kept as in the original; confirm this is intended.
            for name, value in attrs:
                if name == 'class' and value == 'SectionGap':
                    self.flag = False
                else:
                    self.flag = True
                    tr_list.append(value)

    def handle_endtag(self, tag):
        """Pop the nesting stack when ``tag`` closes the innermost open tag."""
        if self._level_stack and tag in self.selected and tag == self._level_stack[-1]:
            self._level_stack.pop()

    def handle_data(self, data):
        """Store text according to the current nesting path.

        Every ``codemap`` value found in ``data`` is first replaced by its
        key (``codemap`` is defined elsewhere in the module).
        """
        for key, encoded in codemap.items():
            data = data.replace(encoded, key)

        path = '/'.join(self._level_stack)
        if path == 'html/head/meta/body/table/tr/td' and self.flag:
            td_list.append(data)
        elif path == 'html/head/meta/body':
            br_list.append(data)
        elif path == 'html/head/meta/style':
            style_list.append(data)
2011-10-31
空空如也
TA创建的收藏夹 TA关注的收藏夹
TA关注的人