Unescape HTML entities in company names

Closes bug 1319873

Change-Id: Idb056d5e74bc6642e788c5abadcdde6a59f9048b
This commit is contained in:
pkholkin 2014-05-16 16:33:52 +04:00
parent ca37098d93
commit ce80c8f655

View File

@ -12,6 +12,7 @@
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import time
@ -41,7 +42,7 @@ def _convert_str_fields_to_unicode(result):
pass
def _retrieve_member(uri, member_id):
def _retrieve_member(uri, member_id, html_parser):
content = utils.read_uri(uri)
@ -63,7 +64,7 @@ def _retrieve_member(uri, member_id):
for rec in re.finditer(COMPANY_PATTERN, content):
result = rec.groupdict()
member['company_draft'] = result['company_draft']
member['company_draft'] = html_parser.unescape(result['company_draft'])
return member
@ -87,11 +88,12 @@ def log(uri, runtime_storage_inst, days_to_update_members):
cnt_empty = 0
cur_index = last_member_index + 1
html_parser = six.moves.html_parser.HTMLParser()
while cnt_empty < CNT_EMPTY_MEMBERS:
profile_uri = uri + str(cur_index)
member = _retrieve_member(profile_uri, str(cur_index))
member = _retrieve_member(profile_uri, str(cur_index), html_parser)
if 'member_name' not in member:
cnt_empty += 1