Backend for openstack.org members report

Implemented backend

Change-Id: I5c2fbb51eeed3a70f22fa7bde2e77b492e2060a3
This commit is contained in:
pkholkin 2014-04-01 13:35:35 +04:00
parent 81c3df03d9
commit ed515b4be9
12 changed files with 312 additions and 4 deletions

View File

@ -6767,6 +6767,8 @@
} }
], ],
"mail_lists": ["http://lists.openstack.org/pipermail/openstack-dev/"], "mail_lists": ["http://lists.openstack.org/pipermail/openstack-dev/"],
"member_lists": ["http://www.openstack.org/community/members/profile/"],
"voting_date": "2014-Jan-01",
"project_types": [ "project_types": [
{ {
"id": "all", "id": "all",

View File

@ -181,6 +181,15 @@
"type": "string" "type": "string"
} }
}, },
"member_lists": {
"type": "array",
"items": {
"type": "string"
}
},
"voting_date": {
"type": "string"
},
"project_types": { "project_types": {
"type": "array", "type": "array",
"items": { "items": {

View File

@ -17,6 +17,9 @@
# Port where dashboard listens on # Port where dashboard listens on
# listen_port = 8080 # listen_port = 8080
# Number of days to update members
# days_to_update_members = 7
# The address of file with corrections data # The address of file with corrections data
# corrections_uri = https://raw.github.com/stackforge/stackalytics/master/etc/corrections.json # corrections_uri = https://raw.github.com/stackforge/stackalytics/master/etc/corrections.json

View File

@ -146,6 +146,8 @@
], ],
"mail_lists": ["http://lists.openstack.org/pipermail/openstack-dev/"], "mail_lists": ["http://lists.openstack.org/pipermail/openstack-dev/"],
"member_lists": ["http://www.openstack.org/community/members/profile/"],
"voting_date": "2014-Jan-01",
"project_types": [ "project_types": [
{ {

View File

@ -28,6 +28,8 @@ OPTS = [
help='The address dashboard listens on'), help='The address dashboard listens on'),
cfg.IntOpt('listen-port', default=8080, cfg.IntOpt('listen-port', default=8080,
help='The port dashboard listens on'), help='The port dashboard listens on'),
cfg.IntOpt('days_to_update_members', default=7,
help='Number of days to update members'),
cfg.StrOpt('corrections-uri', cfg.StrOpt('corrections-uri',
default=('https://raw.github.com/stackforge/stackalytics/' default=('https://raw.github.com/stackforge/stackalytics/'
'master/etc/corrections.json'), 'master/etc/corrections.json'),

View File

@ -127,6 +127,11 @@ def _store_companies(runtime_storage_inst, companies):
for company in companies: for company in companies:
for domain in company['domains']: for domain in company['domains']:
domains_index[domain] = company['company_name'] domains_index[domain] = company['company_name']
if 'aliases' in company:
for alias in company['aliases']:
domains_index[alias] = company['company_name']
runtime_storage_inst.set_by_key('companies', domains_index) runtime_storage_inst.set_by_key('companies', domains_index)

View File

@ -26,6 +26,7 @@ from stackalytics.processor import config
from stackalytics.processor import default_data_processor from stackalytics.processor import default_data_processor
from stackalytics.processor import lp from stackalytics.processor import lp
from stackalytics.processor import mls from stackalytics.processor import mls
from stackalytics.processor import mps
from stackalytics.processor import rcs from stackalytics.processor import rcs
from stackalytics.processor import record_processor from stackalytics.processor import record_processor
from stackalytics.processor import runtime_storage from stackalytics.processor import runtime_storage
@ -140,10 +141,24 @@ def process_mail_list(uri, runtime_storage_inst, record_processor_inst):
runtime_storage_inst.set_records(processed_mail_iterator) runtime_storage_inst.set_records(processed_mail_iterator)
def update_records(runtime_storage_inst): def process_member_list(uri, runtime_storage_inst, record_processor_inst):
member_iterator = mps.log(uri, runtime_storage_inst,
cfg.CONF.days_to_update_members)
member_iterator_typed = _record_typer(member_iterator, 'member')
processed_member_iterator = record_processor_inst.process(
member_iterator_typed)
runtime_storage_inst.set_records(processed_member_iterator)
def update_members(runtime_storage_inst, record_processor_inst):
member_lists = runtime_storage_inst.get_by_key('member_lists') or []
for member_list in member_lists:
process_member_list(member_list, runtime_storage_inst,
record_processor_inst)
def update_records(runtime_storage_inst, record_processor_inst):
repos = utils.load_repos(runtime_storage_inst) repos = utils.load_repos(runtime_storage_inst)
record_processor_inst = record_processor.RecordProcessor(
runtime_storage_inst)
for repo in repos: for repo in repos:
process_repo(repo, runtime_storage_inst, record_processor_inst) process_repo(repo, runtime_storage_inst, record_processor_inst)
@ -244,10 +259,16 @@ def main():
update_pids(runtime_storage_inst) update_pids(runtime_storage_inst)
update_records(runtime_storage_inst) record_processor_inst = record_processor.RecordProcessor(
runtime_storage_inst)
update_records(runtime_storage_inst, record_processor_inst)
apply_corrections(cfg.CONF.corrections_uri, runtime_storage_inst) apply_corrections(cfg.CONF.corrections_uri, runtime_storage_inst)
# long operation should be the last
update_members(runtime_storage_inst, record_processor_inst)
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -0,0 +1,110 @@
# Copyright (c) 2013 Mirantis Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import time
import six
from stackalytics.openstack.common import log as logging
from stackalytics.processor import utils
LOG = logging.getLogger(__name__)
NAME_AND_DATE_PATTERN = r'<h3>(?P<member_name>[^<]*)[\s\S]*?' \
r'<div class="span-7 last">(?P<date_joined>[^<]*)'
COMPANY_PATTERN = r'<strong>Date\sJoined[\s\S]*?<b>(?P<company_draft>[^<]*)' \
r'[\s\S]*?From\s(?P<date_from>[\s\S]*?)\(Current\)'
CNT_EMPTY_MEMBERS = 50
def _convert_str_fields_to_unicode(result):
for field, value in result.iteritems():
if type(value) is str:
try:
value = six.text_type(value, 'utf8')
result[field] = value
except Exception:
pass
def _retrieve_member(uri, member_id):
content = utils.read_uri(uri)
if not content:
return {}
member = {}
for rec in re.finditer(NAME_AND_DATE_PATTERN, content):
result = rec.groupdict()
member['member_id'] = member_id
member['member_name'] = result['member_name']
member['date_joined'] = result['date_joined']
member['member_uri'] = uri
break
member['company_draft'] = '*independent'
for rec in re.finditer(COMPANY_PATTERN, content):
result = rec.groupdict()
member['company_draft'] = result['company_draft']
return member
def log(uri, runtime_storage_inst, days_to_update_members):
LOG.debug('Retrieving new openstack.org members')
last_update_members_date = runtime_storage_inst.get_by_key(
'last_update_members_date') or 0
last_member_index = runtime_storage_inst.get_by_key(
'last_member_index') or 0
end_update_date = int(time.time()) - days_to_update_members * 24 * 60 * 60
if last_update_members_date <= end_update_date:
last_member_index = 0
last_update_members_date = int(time.time())
runtime_storage_inst.set_by_key('last_update_members_date',
last_update_members_date)
cnt_empty = 0
cur_index = last_member_index + 1
while cnt_empty < CNT_EMPTY_MEMBERS:
profile_uri = uri + str(cur_index)
member = _retrieve_member(profile_uri, str(cur_index))
if 'member_name' not in member:
cnt_empty += 1
cur_index += 1
continue
_convert_str_fields_to_unicode(member)
cnt_empty = 0
last_member_index = cur_index
cur_index += 1
LOG.debug('New member: %s', member['member_id'])
yield member
LOG.debug('Last_member_index: %s', last_member_index)
runtime_storage_inst.set_by_key('last_member_index', last_member_index)

View File

@ -397,6 +397,38 @@ class RecordProcessor(object):
yield bpc yield bpc
def _process_member(self, record):
user_id = "member:" + record['member_id']
record['primary_key'] = user_id
record['date'] = utils.member_date_to_timestamp(record['date_joined'])
record['author_name'] = record['member_name']
record['module'] = 'unknown'
company_draft = record['company_draft']
company_name = self.domains_index.get(company_draft) or company_draft
# author_email is a key to create new user
record['author_email'] = user_id
record['company_name'] = company_name
# _update_record_and_user function will create new user if needed
self._update_record_and_user(record)
record['company_name'] = company_name
user = utils.load_user(self.runtime_storage_inst, user_id)
del record['author_email']
user['user_name'] = record['author_name']
user['companies'] = [{
'company_name': company_name,
'end_date': 0,
}]
user['company_name'] = company_name
utils.store_user(self.runtime_storage_inst, user)
record['company_name'] = company_name
yield record
def _apply_type_based_processing(self, record): def _apply_type_based_processing(self, record):
if record['record_type'] == 'commit': if record['record_type'] == 'commit':
for r in self._process_commit(record): for r in self._process_commit(record):
@ -410,6 +442,9 @@ class RecordProcessor(object):
elif record['record_type'] == 'bp': elif record['record_type'] == 'bp':
for r in self._process_blueprint(record): for r in self._process_blueprint(record):
yield r yield r
elif record['record_type'] == 'member':
for r in self._process_member(record):
yield r
def _renew_record_date(self, record): def _renew_record_date(self, record):
record['week'] = utils.timestamp_to_week(record['date']) record['week'] = utils.timestamp_to_week(record['date'])

View File

@ -46,6 +46,13 @@ def date_to_timestamp_ext(d):
return int(d) return int(d)
def member_date_to_timestamp(d):
if not d:
return 0
return int(time.mktime(
datetime.datetime.strptime(d, '%B %d, %Y ').timetuple()))
def iso8601_to_timestamp(s): def iso8601_to_timestamp(s):
return int(time.mktime(iso8601.parse_date(s).timetuple())) return int(time.mktime(iso8601.parse_date(s).timetuple()))

60
tests/unit/test_mps.py Normal file
View File

@ -0,0 +1,60 @@
# Copyright (c) 2013 Mirantis Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import testtools
from stackalytics.processor import mps
class TestMps(testtools.TestCase):
def setUp(self):
super(TestMps, self).setUp()
def test_member_parse_regex(self):
content = '''<h1>Individual Member Profile</h1>
<div class="candidate span-14">
<div class="span-4">
<img src="/themes/openstack/images/generic-profile-photo.png"><p>&nbsp;</p>
</div>
<a name="profile-10501"></a>
<div class="details span-10 last">
<div class="last name-and-title">
<h3>Jim Battenberg</h3>
</div>
<hr><div class="span-3"><strong>Date Joined</strong></div>
<div class="span-7 last">June 25, 2013 <br><br></div>
<div class="span-3"><strong>Affiliations</strong></div>
<div class="span-7 last">
<div>
<b>Rackspace</b> From (Current)
</div>
</div>
<div class="span-3"><strong>Statement of Interest </strong></div>
<div class="span-7 last">
<p>contribute logic and evangelize openstack</p>
</div>
<p>&nbsp;</p>'''
match = re.search(mps.NAME_AND_DATE_PATTERN, content)
self.assertTrue(match)
self.assertEqual('Jim Battenberg', match.group('member_name'))
self.assertEqual('June 25, 2013 ', match.group('date_joined'))
match = re.search(mps.COMPANY_PATTERN, content)
self.assertTrue(match)
self.assertEqual('Rackspace', match.group('company_draft'))

View File

@ -536,6 +536,58 @@ class TestRecordProcessor(testtools.TestCase):
self.assertEqual(user, utils.load_user( self.assertEqual(user, utils.load_user(
record_processor_inst.runtime_storage_inst, 'john_doe@gmail.com')) record_processor_inst.runtime_storage_inst, 'john_doe@gmail.com'))
def test_create_member(self):
member_record = {'member_id': '123456789',
'member_name': 'John Doe',
'member_uri': 'http://www.openstack.org/community'
'/members/profile/123456789',
'date_joined': 'August 01, 2012 ',
'company_draft': 'Mirantis'}
record_processor_inst = self.make_record_processor()
result_member = record_processor_inst._process_member(
member_record).next()
self.assertEqual(result_member['primary_key'], 'member:123456789')
self.assertEqual(result_member['date'], utils.member_date_to_timestamp(
'August 01, 2012 '))
self.assertEqual(result_member['author_name'], 'John Doe')
self.assertEqual(result_member['company_name'], 'Mirantis')
result_user = utils.load_user(
record_processor_inst.runtime_storage_inst, 'member:123456789')
self.assertEqual(result_user['user_name'], 'John Doe')
self.assertEqual(result_user['company_name'], 'Mirantis')
self.assertEqual(result_user['companies'],
[{'company_name': 'Mirantis', 'end_date': 0}])
def test_update_member(self):
member_record = {'member_id': '123456789',
'member_name': 'John Doe',
'member_uri': 'http://www.openstack.org/community'
'/members/profile/123456789',
'date_joined': 'August 01, 2012 ',
'company_draft': 'Mirantis'}
record_processor_inst = self.make_record_processor()
updated_member_record = member_record
updated_member_record['member_name'] = 'Bill Smith'
updated_member_record['company_draft'] = 'Rackspace'
result_member = record_processor_inst._process_member(
updated_member_record).next()
self.assertEqual(result_member['author_name'], 'Bill Smith')
self.assertEqual(result_member['company_name'], 'Rackspace')
result_user = utils.load_user(
record_processor_inst.runtime_storage_inst, 'member:123456789')
self.assertEqual(result_user['user_name'], 'Bill Smith')
self.assertEqual(result_user['companies'],
[{'company_name': 'Rackspace', 'end_date': 0}])
def test_process_email_then_review(self): def test_process_email_then_review(self):
# it is expected that the user profile will contain both email and # it is expected that the user profile will contain both email and
# LP id # LP id