From ce80c8f65597d4b965e5d3ae62053ba6c45691e8 Mon Sep 17 00:00:00 2001 From: pkholkin Date: Fri, 16 May 2014 16:33:52 +0400 Subject: [PATCH] Unescape HTML entities in company names closes bug 1319873 Change-Id: Idb056d5e74bc6642e788c5abadcdde6a59f9048b --- stackalytics/processor/mps.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/stackalytics/processor/mps.py b/stackalytics/processor/mps.py index 41cde1f3e..219b85840 100644 --- a/stackalytics/processor/mps.py +++ b/stackalytics/processor/mps.py @@ -12,6 +12,7 @@ # implied. # See the License for the specific language governing permissions and # limitations under the License. + import re import time @@ -41,7 +42,7 @@ def _convert_str_fields_to_unicode(result): pass -def _retrieve_member(uri, member_id): +def _retrieve_member(uri, member_id, html_parser): content = utils.read_uri(uri) @@ -63,7 +64,7 @@ def _retrieve_member(uri, member_id): for rec in re.finditer(COMPANY_PATTERN, content): result = rec.groupdict() - member['company_draft'] = result['company_draft'] + member['company_draft'] = html_parser.unescape(result['company_draft']) return member @@ -87,11 +88,12 @@ def log(uri, runtime_storage_inst, days_to_update_members): cnt_empty = 0 cur_index = last_member_index + 1 + html_parser = six.moves.html_parser.HTMLParser() while cnt_empty < CNT_EMPTY_MEMBERS: profile_uri = uri + str(cur_index) - member = _retrieve_member(profile_uri, str(cur_index)) + member = _retrieve_member(profile_uri, str(cur_index), html_parser) if 'member_name' not in member: cnt_empty += 1