Backend for openstack.org members report

Implemented backend

Change-Id: I5c2fbb51eeed3a70f22fa7bde2e77b492e2060a3
This commit is contained in:
pkholkin 2014-04-01 13:35:35 +04:00
parent 81c3df03d9
commit ed515b4be9
12 changed files with 312 additions and 4 deletions

View File

@ -6767,6 +6767,8 @@
}
],
"mail_lists": ["http://lists.openstack.org/pipermail/openstack-dev/"],
"member_lists": ["http://www.openstack.org/community/members/profile/"],
"voting_date": "2014-Jan-01",
"project_types": [
{
"id": "all",

View File

@ -181,6 +181,15 @@
"type": "string"
}
},
"member_lists": {
"type": "array",
"items": {
"type": "string"
}
},
"voting_date": {
"type": "string"
},
"project_types": {
"type": "array",
"items": {

View File

@ -17,6 +17,9 @@
# Port where dashboard listens on
# listen_port = 8080
# Number of days to update members
# days_to_update_members = 7
# The address of file with corrections data
# corrections_uri = https://raw.github.com/stackforge/stackalytics/master/etc/corrections.json

View File

@ -146,6 +146,8 @@
],
"mail_lists": ["http://lists.openstack.org/pipermail/openstack-dev/"],
"member_lists": ["http://www.openstack.org/community/members/profile/"],
"voting_date": "2014-Jan-01",
"project_types": [
{

View File

@ -28,6 +28,8 @@ OPTS = [
help='The address dashboard listens on'),
cfg.IntOpt('listen-port', default=8080,
help='The port dashboard listens on'),
cfg.IntOpt('days_to_update_members', default=7,
help='Number of days to update members'),
cfg.StrOpt('corrections-uri',
default=('https://raw.github.com/stackforge/stackalytics/'
'master/etc/corrections.json'),

View File

@ -127,6 +127,11 @@ def _store_companies(runtime_storage_inst, companies):
for company in companies:
for domain in company['domains']:
domains_index[domain] = company['company_name']
if 'aliases' in company:
for alias in company['aliases']:
domains_index[alias] = company['company_name']
runtime_storage_inst.set_by_key('companies', domains_index)

View File

@ -26,6 +26,7 @@ from stackalytics.processor import config
from stackalytics.processor import default_data_processor
from stackalytics.processor import lp
from stackalytics.processor import mls
from stackalytics.processor import mps
from stackalytics.processor import rcs
from stackalytics.processor import record_processor
from stackalytics.processor import runtime_storage
@ -140,10 +141,24 @@ def process_mail_list(uri, runtime_storage_inst, record_processor_inst):
runtime_storage_inst.set_records(processed_mail_iterator)
def update_records(runtime_storage_inst):
def process_member_list(uri, runtime_storage_inst, record_processor_inst):
member_iterator = mps.log(uri, runtime_storage_inst,
cfg.CONF.days_to_update_members)
member_iterator_typed = _record_typer(member_iterator, 'member')
processed_member_iterator = record_processor_inst.process(
member_iterator_typed)
runtime_storage_inst.set_records(processed_member_iterator)
def update_members(runtime_storage_inst, record_processor_inst):
member_lists = runtime_storage_inst.get_by_key('member_lists') or []
for member_list in member_lists:
process_member_list(member_list, runtime_storage_inst,
record_processor_inst)
def update_records(runtime_storage_inst, record_processor_inst):
repos = utils.load_repos(runtime_storage_inst)
record_processor_inst = record_processor.RecordProcessor(
runtime_storage_inst)
for repo in repos:
process_repo(repo, runtime_storage_inst, record_processor_inst)
@ -244,10 +259,16 @@ def main():
update_pids(runtime_storage_inst)
update_records(runtime_storage_inst)
record_processor_inst = record_processor.RecordProcessor(
runtime_storage_inst)
update_records(runtime_storage_inst, record_processor_inst)
apply_corrections(cfg.CONF.corrections_uri, runtime_storage_inst)
# long operation should be the last
update_members(runtime_storage_inst, record_processor_inst)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,110 @@
# Copyright (c) 2013 Mirantis Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import time
import six
from stackalytics.openstack.common import log as logging
from stackalytics.processor import utils
LOG = logging.getLogger(__name__)
NAME_AND_DATE_PATTERN = r'<h3>(?P<member_name>[^<]*)[\s\S]*?' \
r'<div class="span-7 last">(?P<date_joined>[^<]*)'
COMPANY_PATTERN = r'<strong>Date\sJoined[\s\S]*?<b>(?P<company_draft>[^<]*)' \
r'[\s\S]*?From\s(?P<date_from>[\s\S]*?)\(Current\)'
CNT_EMPTY_MEMBERS = 50
def _convert_str_fields_to_unicode(result):
for field, value in result.iteritems():
if type(value) is str:
try:
value = six.text_type(value, 'utf8')
result[field] = value
except Exception:
pass
def _retrieve_member(uri, member_id):
content = utils.read_uri(uri)
if not content:
return {}
member = {}
for rec in re.finditer(NAME_AND_DATE_PATTERN, content):
result = rec.groupdict()
member['member_id'] = member_id
member['member_name'] = result['member_name']
member['date_joined'] = result['date_joined']
member['member_uri'] = uri
break
member['company_draft'] = '*independent'
for rec in re.finditer(COMPANY_PATTERN, content):
result = rec.groupdict()
member['company_draft'] = result['company_draft']
return member
def log(uri, runtime_storage_inst, days_to_update_members):
LOG.debug('Retrieving new openstack.org members')
last_update_members_date = runtime_storage_inst.get_by_key(
'last_update_members_date') or 0
last_member_index = runtime_storage_inst.get_by_key(
'last_member_index') or 0
end_update_date = int(time.time()) - days_to_update_members * 24 * 60 * 60
if last_update_members_date <= end_update_date:
last_member_index = 0
last_update_members_date = int(time.time())
runtime_storage_inst.set_by_key('last_update_members_date',
last_update_members_date)
cnt_empty = 0
cur_index = last_member_index + 1
while cnt_empty < CNT_EMPTY_MEMBERS:
profile_uri = uri + str(cur_index)
member = _retrieve_member(profile_uri, str(cur_index))
if 'member_name' not in member:
cnt_empty += 1
cur_index += 1
continue
_convert_str_fields_to_unicode(member)
cnt_empty = 0
last_member_index = cur_index
cur_index += 1
LOG.debug('New member: %s', member['member_id'])
yield member
LOG.debug('Last_member_index: %s', last_member_index)
runtime_storage_inst.set_by_key('last_member_index', last_member_index)

View File

@ -397,6 +397,38 @@ class RecordProcessor(object):
yield bpc
def _process_member(self, record):
user_id = "member:" + record['member_id']
record['primary_key'] = user_id
record['date'] = utils.member_date_to_timestamp(record['date_joined'])
record['author_name'] = record['member_name']
record['module'] = 'unknown'
company_draft = record['company_draft']
company_name = self.domains_index.get(company_draft) or company_draft
# author_email is a key to create new user
record['author_email'] = user_id
record['company_name'] = company_name
# _update_record_and_user function will create new user if needed
self._update_record_and_user(record)
record['company_name'] = company_name
user = utils.load_user(self.runtime_storage_inst, user_id)
del record['author_email']
user['user_name'] = record['author_name']
user['companies'] = [{
'company_name': company_name,
'end_date': 0,
}]
user['company_name'] = company_name
utils.store_user(self.runtime_storage_inst, user)
record['company_name'] = company_name
yield record
def _apply_type_based_processing(self, record):
if record['record_type'] == 'commit':
for r in self._process_commit(record):
@ -410,6 +442,9 @@ class RecordProcessor(object):
elif record['record_type'] == 'bp':
for r in self._process_blueprint(record):
yield r
elif record['record_type'] == 'member':
for r in self._process_member(record):
yield r
def _renew_record_date(self, record):
record['week'] = utils.timestamp_to_week(record['date'])

View File

@ -46,6 +46,13 @@ def date_to_timestamp_ext(d):
return int(d)
def member_date_to_timestamp(d):
if not d:
return 0
return int(time.mktime(
datetime.datetime.strptime(d, '%B %d, %Y ').timetuple()))
def iso8601_to_timestamp(s):
return int(time.mktime(iso8601.parse_date(s).timetuple()))

60
tests/unit/test_mps.py Normal file
View File

@ -0,0 +1,60 @@
# Copyright (c) 2013 Mirantis Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import testtools
from stackalytics.processor import mps
class TestMps(testtools.TestCase):
def setUp(self):
super(TestMps, self).setUp()
def test_member_parse_regex(self):
content = '''<h1>Individual Member Profile</h1>
<div class="candidate span-14">
<div class="span-4">
<img src="/themes/openstack/images/generic-profile-photo.png"><p>&nbsp;</p>
</div>
<a name="profile-10501"></a>
<div class="details span-10 last">
<div class="last name-and-title">
<h3>Jim Battenberg</h3>
</div>
<hr><div class="span-3"><strong>Date Joined</strong></div>
<div class="span-7 last">June 25, 2013 <br><br></div>
<div class="span-3"><strong>Affiliations</strong></div>
<div class="span-7 last">
<div>
<b>Rackspace</b> From (Current)
</div>
</div>
<div class="span-3"><strong>Statement of Interest </strong></div>
<div class="span-7 last">
<p>contribute logic and evangelize openstack</p>
</div>
<p>&nbsp;</p>'''
match = re.search(mps.NAME_AND_DATE_PATTERN, content)
self.assertTrue(match)
self.assertEqual('Jim Battenberg', match.group('member_name'))
self.assertEqual('June 25, 2013 ', match.group('date_joined'))
match = re.search(mps.COMPANY_PATTERN, content)
self.assertTrue(match)
self.assertEqual('Rackspace', match.group('company_draft'))

View File

@ -536,6 +536,58 @@ class TestRecordProcessor(testtools.TestCase):
self.assertEqual(user, utils.load_user(
record_processor_inst.runtime_storage_inst, 'john_doe@gmail.com'))
def test_create_member(self):
member_record = {'member_id': '123456789',
'member_name': 'John Doe',
'member_uri': 'http://www.openstack.org/community'
'/members/profile/123456789',
'date_joined': 'August 01, 2012 ',
'company_draft': 'Mirantis'}
record_processor_inst = self.make_record_processor()
result_member = record_processor_inst._process_member(
member_record).next()
self.assertEqual(result_member['primary_key'], 'member:123456789')
self.assertEqual(result_member['date'], utils.member_date_to_timestamp(
'August 01, 2012 '))
self.assertEqual(result_member['author_name'], 'John Doe')
self.assertEqual(result_member['company_name'], 'Mirantis')
result_user = utils.load_user(
record_processor_inst.runtime_storage_inst, 'member:123456789')
self.assertEqual(result_user['user_name'], 'John Doe')
self.assertEqual(result_user['company_name'], 'Mirantis')
self.assertEqual(result_user['companies'],
[{'company_name': 'Mirantis', 'end_date': 0}])
def test_update_member(self):
member_record = {'member_id': '123456789',
'member_name': 'John Doe',
'member_uri': 'http://www.openstack.org/community'
'/members/profile/123456789',
'date_joined': 'August 01, 2012 ',
'company_draft': 'Mirantis'}
record_processor_inst = self.make_record_processor()
updated_member_record = member_record
updated_member_record['member_name'] = 'Bill Smith'
updated_member_record['company_draft'] = 'Rackspace'
result_member = record_processor_inst._process_member(
updated_member_record).next()
self.assertEqual(result_member['author_name'], 'Bill Smith')
self.assertEqual(result_member['company_name'], 'Rackspace')
result_user = utils.load_user(
record_processor_inst.runtime_storage_inst, 'member:123456789')
self.assertEqual(result_user['user_name'], 'Bill Smith')
self.assertEqual(result_user['companies'],
[{'company_name': 'Rackspace', 'end_date': 0}])
def test_process_email_then_review(self):
# it is expected that the user profile will contain both email and
# LP id