diff --git a/etc/default_data.json b/etc/default_data.json index a54bc42a7..796f9a0ed 100644 --- a/etc/default_data.json +++ b/etc/default_data.json @@ -6767,6 +6767,8 @@ } ], "mail_lists": ["http://lists.openstack.org/pipermail/openstack-dev/"], + "member_lists": ["http://www.openstack.org/community/members/profile/"], + "voting_date": "2014-Jan-01", "project_types": [ { "id": "all", diff --git a/etc/default_data.schema.json b/etc/default_data.schema.json index 03692f2ca..596b0e6a6 100644 --- a/etc/default_data.schema.json +++ b/etc/default_data.schema.json @@ -181,6 +181,15 @@ "type": "string" } }, + "member_lists": { + "type": "array", + "items": { + "type": "string" + } + }, + "voting_date": { + "type": "string" + }, "project_types": { "type": "array", "items": { diff --git a/etc/stackalytics.conf b/etc/stackalytics.conf index 73a628c3f..df382141b 100644 --- a/etc/stackalytics.conf +++ b/etc/stackalytics.conf @@ -17,6 +17,9 @@ # Port where dashboard listens on # listen_port = 8080 +# Number of days to update members +# days_to_update_members = 7 + # The address of file with corrections data # corrections_uri = https://raw.github.com/stackforge/stackalytics/master/etc/corrections.json diff --git a/etc/test_default_data.json b/etc/test_default_data.json index da5e2f445..2752dfcad 100644 --- a/etc/test_default_data.json +++ b/etc/test_default_data.json @@ -146,6 +146,8 @@ ], "mail_lists": ["http://lists.openstack.org/pipermail/openstack-dev/"], + "member_lists": ["http://www.openstack.org/community/members/profile/"], + "voting_date": "2014-Jan-01", "project_types": [ { diff --git a/stackalytics/processor/config.py b/stackalytics/processor/config.py index 6a8273a0d..9f2eacfb5 100644 --- a/stackalytics/processor/config.py +++ b/stackalytics/processor/config.py @@ -28,6 +28,8 @@ OPTS = [ help='The address dashboard listens on'), cfg.IntOpt('listen-port', default=8080, help='The port dashboard listens on'), + cfg.IntOpt('days_to_update_members', default=7, + help='Number of days to update members'), cfg.StrOpt('corrections-uri', default=('https://raw.github.com/stackforge/stackalytics/' 'master/etc/corrections.json'), diff --git a/stackalytics/processor/default_data_processor.py b/stackalytics/processor/default_data_processor.py index 769bb420c..7e839b573 100644 --- a/stackalytics/processor/default_data_processor.py +++ b/stackalytics/processor/default_data_processor.py @@ -127,6 +127,11 @@ def _store_companies(runtime_storage_inst, companies): for company in companies: for domain in company['domains']: domains_index[domain] = company['company_name'] + + if 'aliases' in company: + for alias in company['aliases']: + domains_index[alias] = company['company_name'] + runtime_storage_inst.set_by_key('companies', domains_index) diff --git a/stackalytics/processor/main.py b/stackalytics/processor/main.py index 8ae79c47e..5745f5065 100644 --- a/stackalytics/processor/main.py +++ b/stackalytics/processor/main.py @@ -26,6 +26,7 @@ from stackalytics.processor import config from stackalytics.processor import default_data_processor from stackalytics.processor import lp from stackalytics.processor import mls +from stackalytics.processor import mps from stackalytics.processor import rcs from stackalytics.processor import record_processor from stackalytics.processor import runtime_storage @@ -140,10 +141,24 @@ def process_mail_list(uri, runtime_storage_inst, record_processor_inst): runtime_storage_inst.set_records(processed_mail_iterator) -def update_records(runtime_storage_inst): +def process_member_list(uri, runtime_storage_inst, record_processor_inst): + member_iterator = mps.log(uri, runtime_storage_inst, + cfg.CONF.days_to_update_members) + member_iterator_typed = _record_typer(member_iterator, 'member') + processed_member_iterator = record_processor_inst.process( + member_iterator_typed) + runtime_storage_inst.set_records(processed_member_iterator) + + +def update_members(runtime_storage_inst, record_processor_inst): + member_lists = runtime_storage_inst.get_by_key('member_lists') or [] + for member_list in member_lists: + process_member_list(member_list, runtime_storage_inst, + record_processor_inst) + + +def update_records(runtime_storage_inst, record_processor_inst): repos = utils.load_repos(runtime_storage_inst) - record_processor_inst = record_processor.RecordProcessor( - runtime_storage_inst) for repo in repos: process_repo(repo, runtime_storage_inst, record_processor_inst) @@ -244,10 +259,16 @@ def main(): update_pids(runtime_storage_inst) - update_records(runtime_storage_inst) + record_processor_inst = record_processor.RecordProcessor( + runtime_storage_inst) + + update_records(runtime_storage_inst, record_processor_inst) apply_corrections(cfg.CONF.corrections_uri, runtime_storage_inst) + # long operation should be the last + update_members(runtime_storage_inst, record_processor_inst) + if __name__ == '__main__': main() diff --git a/stackalytics/processor/mps.py b/stackalytics/processor/mps.py new file mode 100644 index 000000000..41cde1f3e --- /dev/null +++ b/stackalytics/processor/mps.py @@ -0,0 +1,110 @@ +# Copyright (c) 2013 Mirantis Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +import time + +import six + +from stackalytics.openstack.common import log as logging +from stackalytics.processor import utils + + +LOG = logging.getLogger(__name__) + +NAME_AND_DATE_PATTERN = r'

(?P[^<]*)[\s\S]*?' \ + r'
(?P[^<]*)' +COMPANY_PATTERN = r'Date\sJoined[\s\S]*?(?P[^<]*)' \ + r'[\s\S]*?From\s(?P[\s\S]*?)\(Current\)' + +CNT_EMPTY_MEMBERS = 50 + + +def _convert_str_fields_to_unicode(result): + for field, value in result.iteritems(): + if type(value) is str: + try: + value = six.text_type(value, 'utf8') + result[field] = value + except Exception: + pass + + +def _retrieve_member(uri, member_id): + + content = utils.read_uri(uri) + + if not content: + return {} + + member = {} + + for rec in re.finditer(NAME_AND_DATE_PATTERN, content): + result = rec.groupdict() + + member['member_id'] = member_id + member['member_name'] = result['member_name'] + member['date_joined'] = result['date_joined'] + member['member_uri'] = uri + break + + member['company_draft'] = '*independent' + for rec in re.finditer(COMPANY_PATTERN, content): + result = rec.groupdict() + + member['company_draft'] = result['company_draft'] + + return member + + +def log(uri, runtime_storage_inst, days_to_update_members): + LOG.debug('Retrieving new openstack.org members') + + last_update_members_date = runtime_storage_inst.get_by_key( + 'last_update_members_date') or 0 + last_member_index = runtime_storage_inst.get_by_key( + 'last_member_index') or 0 + + end_update_date = int(time.time()) - days_to_update_members * 24 * 60 * 60 + + if last_update_members_date <= end_update_date: + last_member_index = 0 + last_update_members_date = int(time.time()) + + runtime_storage_inst.set_by_key('last_update_members_date', + last_update_members_date) + + cnt_empty = 0 + cur_index = last_member_index + 1 + + while cnt_empty < CNT_EMPTY_MEMBERS: + + profile_uri = uri + str(cur_index) + member = _retrieve_member(profile_uri, str(cur_index)) + + if 'member_name' not in member: + cnt_empty += 1 + cur_index += 1 + continue + + _convert_str_fields_to_unicode(member) + + cnt_empty = 0 + last_member_index = cur_index + cur_index += 1 + LOG.debug('New member: %s', member['member_id']) + yield member + + LOG.debug('Last_member_index: %s', last_member_index) + runtime_storage_inst.set_by_key('last_member_index', last_member_index) diff --git a/stackalytics/processor/record_processor.py b/stackalytics/processor/record_processor.py index ef8b8572d..265846228 100644 --- a/stackalytics/processor/record_processor.py +++ b/stackalytics/processor/record_processor.py @@ -397,6 +397,38 @@ class RecordProcessor(object): yield bpc + def _process_member(self, record): + user_id = "member:" + record['member_id'] + record['primary_key'] = user_id + record['date'] = utils.member_date_to_timestamp(record['date_joined']) + record['author_name'] = record['member_name'] + record['module'] = 'unknown' + company_draft = record['company_draft'] + + company_name = self.domains_index.get(company_draft) or company_draft + + # author_email is a key to create new user + record['author_email'] = user_id + record['company_name'] = company_name + # _update_record_and_user function will create new user if needed + self._update_record_and_user(record) + record['company_name'] = company_name + user = utils.load_user(self.runtime_storage_inst, user_id) + del record['author_email'] + + user['user_name'] = record['author_name'] + user['companies'] = [{ + 'company_name': company_name, + 'end_date': 0, + }] + user['company_name'] = company_name + + utils.store_user(self.runtime_storage_inst, user) + + record['company_name'] = company_name + + yield record + def _apply_type_based_processing(self, record): if record['record_type'] == 'commit': for r in self._process_commit(record): @@ -410,6 +442,9 @@ class RecordProcessor(object): elif record['record_type'] == 'bp': for r in self._process_blueprint(record): yield r + elif record['record_type'] == 'member': + for r in self._process_member(record): + yield r def _renew_record_date(self, record): record['week'] = utils.timestamp_to_week(record['date']) diff --git a/stackalytics/processor/utils.py b/stackalytics/processor/utils.py index 7942d4e97..c1c3eed78 100644 --- a/stackalytics/processor/utils.py +++ b/stackalytics/processor/utils.py @@ -46,6 +46,13 @@ def date_to_timestamp_ext(d): return int(d) +def member_date_to_timestamp(d): + if not d: + return 0 + return int(time.mktime( + datetime.datetime.strptime(d, '%B %d, %Y ').timetuple())) + + def iso8601_to_timestamp(s): return int(time.mktime(iso8601.parse_date(s).timetuple())) diff --git a/tests/unit/test_mps.py b/tests/unit/test_mps.py new file mode 100644 index 000000000..07984fbdb --- /dev/null +++ b/tests/unit/test_mps.py @@ -0,0 +1,60 @@ +# Copyright (c) 2013 Mirantis Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import testtools + +from stackalytics.processor import mps + + +class TestMps(testtools.TestCase): + def setUp(self): + super(TestMps, self).setUp() + + def test_member_parse_regex(self): + + content = '''

Individual Member Profile

+
+
+

 

+
+ +
+
+

Jim Battenberg

+
+
Date Joined
+
June 25, 2013

+
Affiliations
+
+
+ Rackspace From (Current) +
+
+
Statement of Interest
+
+

contribute logic and evangelize openstack

+
+

 

''' + + match = re.search(mps.NAME_AND_DATE_PATTERN, content) + self.assertTrue(match) + self.assertEqual('Jim Battenberg', match.group('member_name')) + self.assertEqual('June 25, 2013 ', match.group('date_joined')) + + match = re.search(mps.COMPANY_PATTERN, content) + self.assertTrue(match) + self.assertEqual('Rackspace', match.group('company_draft')) diff --git a/tests/unit/test_record_processor.py b/tests/unit/test_record_processor.py index e771128b1..019c68d59 100644 --- a/tests/unit/test_record_processor.py +++ b/tests/unit/test_record_processor.py @@ -536,6 +536,58 @@ class TestRecordProcessor(testtools.TestCase): self.assertEqual(user, utils.load_user( record_processor_inst.runtime_storage_inst, 'john_doe@gmail.com')) + def test_create_member(self): + member_record = {'member_id': '123456789', + 'member_name': 'John Doe', + 'member_uri': 'http://www.openstack.org/community' + '/members/profile/123456789', + 'date_joined': 'August 01, 2012 ', + 'company_draft': 'Mirantis'} + + record_processor_inst = self.make_record_processor() + result_member = record_processor_inst._process_member( + member_record).next() + + self.assertEqual(result_member['primary_key'], 'member:123456789') + self.assertEqual(result_member['date'], utils.member_date_to_timestamp( + 'August 01, 2012 ')) + self.assertEqual(result_member['author_name'], 'John Doe') + self.assertEqual(result_member['company_name'], 'Mirantis') + + result_user = utils.load_user( + record_processor_inst.runtime_storage_inst, 'member:123456789') + + self.assertEqual(result_user['user_name'], 'John Doe') + self.assertEqual(result_user['company_name'], 'Mirantis') + self.assertEqual(result_user['companies'], + [{'company_name': 'Mirantis', 'end_date': 0}]) + + def test_update_member(self): + member_record = {'member_id': '123456789', + 'member_name': 'John Doe', + 'member_uri': 'http://www.openstack.org/community' + '/members/profile/123456789', + 'date_joined': 'August 01, 2012 ', + 'company_draft': 'Mirantis'} + + record_processor_inst = self.make_record_processor() + + updated_member_record = member_record + updated_member_record['member_name'] = 'Bill Smith' + updated_member_record['company_draft'] = 'Rackspace' + + result_member = record_processor_inst._process_member( + updated_member_record).next() + self.assertEqual(result_member['author_name'], 'Bill Smith') + self.assertEqual(result_member['company_name'], 'Rackspace') + + result_user = utils.load_user( + record_processor_inst.runtime_storage_inst, 'member:123456789') + + self.assertEqual(result_user['user_name'], 'Bill Smith') + self.assertEqual(result_user['companies'], + [{'company_name': 'Rackspace', 'end_date': 0}]) + def test_process_email_then_review(self): # it is expected that the user profile will contain both email and # LP id