Optimize processing of email bodies

1. Filter out quoted replies and part delimiters from the email body
2. Trim the email body to 4000 characters

Change-Id: I80f27ad551674a8aa9e5e26faeb424a0b85c24b0
This commit is contained in:
Ilya Shakhat 2015-10-12 15:38:04 +03:00
parent 773cfbaac9
commit bbdd60302a
3 changed files with 55 additions and 1 deletions

View File

@ -48,7 +48,7 @@ MESSAGE_PATTERNS = {
re.IGNORECASE),
}
TRAILING_RECORD = ('From ishakhat at mirantis.com Tue Sep 17 07:30:43 2013'
TRAILING_RECORD = ('From ishakhat at mirantis.com Tue Sep 17 07:30:43 2013\n'
'From: ')
@ -71,6 +71,20 @@ def _uri_content_changed(uri, runtime_storage_inst):
return False
def _optimize_body(email_body):
result = []
for line in email_body.split('\n'):
line = line.strip()
if line[:1] == '>' or line[:8] == '--------':
continue # ignore replies and part delimiters
if (not result) or (result and result[-1] != line):
result.append(line)
return '\n'.join(result)
def _retrieve_mails(uri):
LOG.debug('Retrieving mail archive from: %s', uri)
content = utils.read_gzip_from_uri(uri)
@ -91,6 +105,8 @@ def _retrieve_mails(uri):
email['date'] = int(email_utils.mktime_tz(
email_utils.parsedate_tz(email['date'])))
email['body'] = _optimize_body(email['body'])
for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
collection = set()
for item in re.finditer(pattern, email['body']):

View File

@ -459,6 +459,8 @@ class RecordProcessor(object):
if not record.get('blueprint_id'):
del record['body']
elif len(record['body']) > 4000:
record['body'] = record['body'][:4000] + '...'
yield record

View File

@ -15,11 +15,31 @@
import re
import mock
import testtools
from stackalytics.processor import mls
EMAIL_CONTENT = '''
From sorlando at nicira.com Tue Jul 17 07:30:43 2012
From: sorlando at nicira.com (Salvatore Orlando)
Date: Tue, 17 Jul 2012 00:30:43 -0700
Subject: [openstack-dev] [nova] [pci device passthrough] fails with
"NameError: global name '_' is not defined"
In-Reply-To: <5004FBF1.1080102@redhat.com>
References: <5004FBF1.1080102@redhat.com>
Message-ID: <CAGR=i3htLvDOdh5u6mxqmo0zVP1eKKYAxAhj=e1-rpQWZOiF6Q@gmail.com>
Good morning Gary!
-----------------
test works :)
> Reply
'''
class TestMls(testtools.TestCase):
def setUp(self):
super(TestMls, self).setUp()
@ -57,3 +77,19 @@ From: sorlando at nicira.com (Salvatore Orlando)
'e1-rpQWZOiF6Q@gmail.com>', match.group(5))
self.assertEqual('Good morning Gary!\n\ntest works :)\n',
match.group(6))
# Runs mls.log with the three patched helpers below replaced by mocks and
# checks the single email it yields.
# mock.patch decorators are applied bottom-up, so the mock arguments are
# passed in the reverse of the order in which the decorators are listed.
@mock.patch('stackalytics.processor.utils.read_gzip_from_uri')
@mock.patch('stackalytics.processor.mls._get_mail_archive_links')
@mock.patch('stackalytics.processor.mls._uri_content_changed')
def test_log(self, mock_uri_content_changed, mock_get_mail_archive_links,
             mock_read_gzip_from_uri):
    # Canned return values for the patched helpers.
    mock_uri_content_changed.return_value = True
    mock_get_mail_archive_links.return_value = ['link']
    # The patched reader hands back the module-level EMAIL_CONTENT fixture.
    mock_read_gzip_from_uri.return_value = EMAIL_CONTENT
    # Stand-in for the second argument of mls.log.
    # NOTE(review): presumably the runtime storage instance - confirm.
    mock_rsi = mock.Mock()
    # list() consumes whatever iterable mls.log returns.
    emails = list(mls.log('uri', mock_rsi))
    self.assertEqual(1, len(emails))
    # The expected body omits the '-----------------' delimiter and the
    # '> Reply' line that are present in EMAIL_CONTENT.
    self.assertEqual('Good morning Gary!\n\ntest works :)\n',
                     emails[0]['body'])