Optimization of email bodies processing
1. Filter out replies from the email body.
2. Trim the email body to 4k characters.

Change-Id: I80f27ad551674a8aa9e5e26faeb424a0b85c24b0
This commit is contained in:
parent
773cfbaac9
commit
bbdd60302a
@ -48,7 +48,7 @@ MESSAGE_PATTERNS = {
|
||||
re.IGNORECASE),
|
||||
}
|
||||
|
||||
TRAILING_RECORD = ('From ishakhat at mirantis.com Tue Sep 17 07:30:43 2013'
|
||||
TRAILING_RECORD = ('From ishakhat at mirantis.com Tue Sep 17 07:30:43 2013\n'
|
||||
'From: ')
|
||||
|
||||
|
||||
@ -71,6 +71,20 @@ def _uri_content_changed(uri, runtime_storage_inst):
|
||||
return False
|
||||
|
||||
|
||||
def _optimize_body(email_body):
|
||||
result = []
|
||||
for line in email_body.split('\n'):
|
||||
line = line.strip()
|
||||
|
||||
if line[:1] == '>' or line[:8] == '--------':
|
||||
continue # ignore replies and part delimiters
|
||||
|
||||
if (not result) or (result and result[-1] != line):
|
||||
result.append(line)
|
||||
|
||||
return '\n'.join(result)
|
||||
|
||||
|
||||
def _retrieve_mails(uri):
|
||||
LOG.debug('Retrieving mail archive from: %s', uri)
|
||||
content = utils.read_gzip_from_uri(uri)
|
||||
@ -91,6 +105,8 @@ def _retrieve_mails(uri):
|
||||
email['date'] = int(email_utils.mktime_tz(
|
||||
email_utils.parsedate_tz(email['date'])))
|
||||
|
||||
email['body'] = _optimize_body(email['body'])
|
||||
|
||||
for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
|
||||
collection = set()
|
||||
for item in re.finditer(pattern, email['body']):
|
||||
|
@ -459,6 +459,8 @@ class RecordProcessor(object):
|
||||
|
||||
if not record.get('blueprint_id'):
|
||||
del record['body']
|
||||
elif len(record['body']) > 4000:
|
||||
record['body'] = record['body'][:4000] + '...'
|
||||
|
||||
yield record
|
||||
|
||||
|
@ -15,11 +15,31 @@
|
||||
|
||||
import re
|
||||
|
||||
import mock
|
||||
import testtools
|
||||
|
||||
from stackalytics.processor import mls
|
||||
|
||||
|
||||
EMAIL_CONTENT = '''
|
||||
From sorlando at nicira.com Tue Jul 17 07:30:43 2012
|
||||
From: sorlando at nicira.com (Salvatore Orlando)
|
||||
Date: Tue, 17 Jul 2012 00:30:43 -0700
|
||||
Subject: [openstack-dev] [nova] [pci device passthrough] fails with
|
||||
"NameError: global name '_' is not defined"
|
||||
In-Reply-To: <5004FBF1.1080102@redhat.com>
|
||||
References: <5004FBF1.1080102@redhat.com>
|
||||
Message-ID: <CAGR=i3htLvDOdh5u6mxqmo0zVP1eKKYAxAhj=e1-rpQWZOiF6Q@gmail.com>
|
||||
|
||||
Good morning Gary!
|
||||
-----------------
|
||||
|
||||
test works :)
|
||||
|
||||
> Reply
|
||||
'''
|
||||
|
||||
|
||||
class TestMls(testtools.TestCase):
|
||||
def setUp(self):
|
||||
super(TestMls, self).setUp()
|
||||
@ -57,3 +77,19 @@ From: sorlando at nicira.com (Salvatore Orlando)
|
||||
'e1-rpQWZOiF6Q@gmail.com>', match.group(5))
|
||||
self.assertEqual('Good morning Gary!\n\ntest works :)\n',
|
||||
match.group(6))
|
||||
|
||||
@mock.patch('stackalytics.processor.utils.read_gzip_from_uri')
@mock.patch('stackalytics.processor.mls._get_mail_archive_links')
@mock.patch('stackalytics.processor.mls._uri_content_changed')
def test_log(self, mock_uri_content_changed, mock_get_mail_archive_links,
             mock_read_gzip_from_uri):
    """Check that mls.log yields one email with a cleaned-up body."""
    # Stacked patch decorators hand their mocks over bottom-up, hence the
    # argument order above is the reverse of the decorator order.
    mock_read_gzip_from_uri.return_value = EMAIL_CONTENT
    mock_get_mail_archive_links.return_value = ['link']
    mock_uri_content_changed.return_value = True
    storage_stub = mock.Mock()

    records = list(mls.log('uri', storage_stub))

    self.assertEqual(1, len(records))
    # The dashed delimiter and the '> Reply' line of EMAIL_CONTENT must not
    # appear in the resulting body.
    body = records[0]['body']
    self.assertEqual('Good morning Gary!\n\ntest works :)\n', body)
|
||||
|
Loading…
x
Reference in New Issue
Block a user