Author: croberts
Date: 2011-09-16 19:49:56 +0000 (Fri, 16 Sep 2011)
New Revision: 4978
Modified:
trunk/wooly/python/wooly/util.py
Log:
Adding unescape_entity utility method.
Modified: trunk/wooly/python/wooly/util.py
===================================================================
--- trunk/wooly/python/wooly/util.py 2011-09-16 18:43:07 UTC (rev 4977)
+++ trunk/wooly/python/wooly/util.py 2011-09-16 19:49:56 UTC (rev 4978)
@@ -1,4 +1,5 @@
import htmlentitydefs as entity
+import re
import logging
import math
import os
@@ -42,6 +43,35 @@
t += i
return t
+##
+# Removes HTML or XML character references and entities from a text string.
+#
+# @param text The HTML (or XML) source text.
+# @return The plain text, as a Unicode string, if necessary.
+# from Fredrik Lundh
+# http://effbot.org/zone/re-sub.htm#unescape-html
+##
def unescape_entity(text):
    """Replace HTML/XML character references and entities with characters.

    Numeric references (``&#65;``, ``&#x41;``, ``&#X41;``) and named
    entities found in ``entity.name2codepoint`` (``&amp;``, ``&lt;``, ...)
    are replaced by the character they denote.  Anything that cannot be
    decoded -- an unknown entity name, malformed digits, or a number that
    is not a representable code point -- is left in the text unchanged.

    Adapted from Fredrik Lundh,
    http://effbot.org/zone/re-sub.htm#unescape-html

    @param text The HTML (or XML) source text.
    @return The text with every decodable reference replaced.
    """
    try:
        to_char = unichr  # Python 2: builds a unicode character
    except NameError:
        to_char = chr  # Python 3: chr() already covers all of Unicode

    def fixup(m):
        ref = m.group(0)
        try:
            if ref.startswith(("&#x", "&#X")):
                # hexadecimal character reference
                return to_char(int(ref[3:-1], 16))
            if ref.startswith("&#"):
                # decimal character reference
                return to_char(int(ref[2:-1]))
            # named entity
            return to_char(entity.name2codepoint[ref[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name.
            # ValueError: malformed digits or out-of-range code point.
            # OverflowError: number too large for the character
            # conversion; previously this escaped and aborted the whole
            # substitution.
            return ref  # leave as is

    # Raw string so that \w reaches the regex engine untouched.
    return re.sub(r"&#?\w+;", fixup, text)
+
class Writer(object):
def __init__(self):
self.writer = StringIO()