From bbdd60302a68433df61bb8f39599061e71b46487 Mon Sep 17 00:00:00 2001 From: Ilya Shakhat Date: Mon, 12 Oct 2015 15:38:04 +0300 Subject: [PATCH] Optimization of email bodies processing 1. Filter out replies from email body 2. Trim email body to 4k Change-Id: I80f27ad551674a8aa9e5e26faeb424a0b85c24b0 --- stackalytics/processor/mls.py | 18 ++++++++++- stackalytics/processor/record_processor.py | 2 ++ stackalytics/tests/unit/test_mls.py | 36 ++++++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/stackalytics/processor/mls.py b/stackalytics/processor/mls.py index 191ad6fd6..b05be3dcc 100644 --- a/stackalytics/processor/mls.py +++ b/stackalytics/processor/mls.py @@ -48,7 +48,7 @@ MESSAGE_PATTERNS = { re.IGNORECASE), } -TRAILING_RECORD = ('From ishakhat at mirantis.com Tue Sep 17 07:30:43 2013' +TRAILING_RECORD = ('From ishakhat at mirantis.com Tue Sep 17 07:30:43 2013\n' 'From: ') @@ -71,6 +71,20 @@ def _uri_content_changed(uri, runtime_storage_inst): return False +def _optimize_body(email_body): + result = [] + for line in email_body.split('\n'): + line = line.strip() + + if line[:1] == '>' or line[:8] == '--------': + continue # ignore replies and part delimiters + + if (not result) or (result and result[-1] != line): + result.append(line) + + return '\n'.join(result) + + def _retrieve_mails(uri): LOG.debug('Retrieving mail archive from: %s', uri) content = utils.read_gzip_from_uri(uri) @@ -91,6 +105,8 @@ def _retrieve_mails(uri): email['date'] = int(email_utils.mktime_tz( email_utils.parsedate_tz(email['date']))) + email['body'] = _optimize_body(email['body']) + for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS): collection = set() for item in re.finditer(pattern, email['body']): diff --git a/stackalytics/processor/record_processor.py b/stackalytics/processor/record_processor.py index 6a71749b2..997c41184 100644 --- a/stackalytics/processor/record_processor.py +++ b/stackalytics/processor/record_processor.py @@ -459,6 +459,8 @@ class RecordProcessor(object): if not record.get('blueprint_id'): del record['body'] + elif len(record['body']) > 4000: + record['body'] = record['body'][:4000] + '...' yield record diff --git a/stackalytics/tests/unit/test_mls.py b/stackalytics/tests/unit/test_mls.py index b92d75a01..e034cd7d1 100644 --- a/stackalytics/tests/unit/test_mls.py +++ b/stackalytics/tests/unit/test_mls.py @@ -15,11 +15,31 @@ import re +import mock import testtools from stackalytics.processor import mls +EMAIL_CONTENT = ''' +From sorlando at nicira.com Tue Jul 17 07:30:43 2012 +From: sorlando at nicira.com (Salvatore Orlando) +Date: Tue, 17 Jul 2012 00:30:43 -0700 +Subject: [openstack-dev] [nova] [pci device passthrough] fails with + "NameError: global name '_' is not defined" +In-Reply-To: <5004FBF1.1080102@redhat.com> +References: <5004FBF1.1080102@redhat.com> +Message-ID: + +Good morning Gary! +----------------- + +test works :) + +> Reply +''' + + class TestMls(testtools.TestCase): def setUp(self): super(TestMls, self).setUp() @@ -57,3 +77,19 @@ From: sorlando at nicira.com (Salvatore Orlando) 'e1-rpQWZOiF6Q@gmail.com>', match.group(5)) self.assertEqual('Good morning Gary!\n\ntest works :)\n', match.group(6)) + + @mock.patch('stackalytics.processor.utils.read_gzip_from_uri') + @mock.patch('stackalytics.processor.mls._get_mail_archive_links') + @mock.patch('stackalytics.processor.mls._uri_content_changed') + def test_log(self, mock_uri_content_changed, mock_get_mail_archive_links, + mock_read_gzip_from_uri): + mock_uri_content_changed.return_value = True + mock_get_mail_archive_links.return_value = ['link'] + mock_read_gzip_from_uri.return_value = EMAIL_CONTENT + mock_rsi = mock.Mock() + + emails = list(mls.log('uri', mock_rsi)) + + self.assertEqual(1, len(emails)) + self.assertEqual('Good morning Gary!\n\ntest works :)\n', + emails[0]['body'])