libvirt: Race condition leads to instance in error

ImageCacheManager deletes base image while image backend is copying
image to the instance path leading instance to go in the error state.

Acquired lock before removing image from cache. If libvirt is copying
image to the instance path, image cache manager won't be able to remove
it until libvirt finishes copying image completely.

Closes-Bug: 1256838
Closes-Bug: 1470437
Co-Authored-By: Michael Still <mikal@stillhq.com>
Depends-On: I337ce28e2fc516c91bec61ca3639ebff0029ad49
Change-Id: I376cc951922c338669fdf3f83da83e0d3cea1532
This commit is contained in:
Ankit Agrawal 2015-08-10 16:27:57 +10:00
parent 0d8a3d90fb
commit ec9d5e375e
9 changed files with 88 additions and 31 deletions

View File

@ -246,3 +246,6 @@ ploop: CommandFilter, ploop, root
# nova/virt/libvirt/utils.py: 'xend', 'status' # nova/virt/libvirt/utils.py: 'xend', 'status'
xend: CommandFilter, xend, root xend: CommandFilter, xend, root
# nova/virt/libvirt/utils.py:
touch: CommandFilter, touch, root

View File

@ -77,6 +77,10 @@ def chown(path, owner):
pass pass
def update_mtime(path):
pass
def extract_snapshot(disk_path, source_fmt, out_path, dest_fmt): def extract_snapshot(disk_path, source_fmt, out_path, dest_fmt):
files[out_path] = '' files[out_path] = ''

View File

@ -20,6 +20,7 @@ import os
import time import time
import mock import mock
from oslo_concurrency import lockutils
from oslo_concurrency import processutils from oslo_concurrency import processutils
from oslo_config import cfg from oslo_config import cfg
from oslo_log import formatters from oslo_log import formatters
@ -461,34 +462,32 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
[fname]) [fname])
self.assertEqual(image_cache_manager.corrupt_base_files, []) self.assertEqual(image_cache_manager.corrupt_base_files, [])
def test_handle_base_image_used(self): @mock.patch.object(libvirt_utils, 'update_mtime')
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None) def test_handle_base_image_used(self, mock_mtime):
img = '123' img = '123'
with self._make_base_file() as fname: with self._make_base_file() as fname:
os.utime(fname, (-1, time.time() - 3601))
image_cache_manager = imagecache.ImageCacheManager() image_cache_manager = imagecache.ImageCacheManager()
image_cache_manager.unexplained_images = [fname] image_cache_manager.unexplained_images = [fname]
image_cache_manager.used_images = {'123': (1, 0, ['banana-42'])} image_cache_manager.used_images = {'123': (1, 0, ['banana-42'])}
image_cache_manager._handle_base_image(img, fname) image_cache_manager._handle_base_image(img, fname)
mock_mtime.assert_called_once_with(fname)
self.assertEqual(image_cache_manager.unexplained_images, []) self.assertEqual(image_cache_manager.unexplained_images, [])
self.assertEqual(image_cache_manager.removable_base_files, []) self.assertEqual(image_cache_manager.removable_base_files, [])
self.assertEqual(image_cache_manager.corrupt_base_files, []) self.assertEqual(image_cache_manager.corrupt_base_files, [])
def test_handle_base_image_used_remotely(self): @mock.patch.object(libvirt_utils, 'update_mtime')
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None) def test_handle_base_image_used_remotely(self, mock_mtime):
img = '123' img = '123'
with self._make_base_file() as fname: with self._make_base_file() as fname:
os.utime(fname, (-1, time.time() - 3601))
image_cache_manager = imagecache.ImageCacheManager() image_cache_manager = imagecache.ImageCacheManager()
image_cache_manager.unexplained_images = [fname] image_cache_manager.unexplained_images = [fname]
image_cache_manager.used_images = {'123': (0, 1, ['banana-42'])} image_cache_manager.used_images = {'123': (0, 1, ['banana-42'])}
image_cache_manager._handle_base_image(img, fname) image_cache_manager._handle_base_image(img, fname)
mock_mtime.assert_called_once_with(fname)
self.assertEqual(image_cache_manager.unexplained_images, []) self.assertEqual(image_cache_manager.unexplained_images, [])
self.assertEqual(image_cache_manager.removable_base_files, []) self.assertEqual(image_cache_manager.removable_base_files, [])
self.assertEqual(image_cache_manager.corrupt_base_files, []) self.assertEqual(image_cache_manager.corrupt_base_files, [])
@ -527,9 +526,9 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
self.assertEqual(image_cache_manager.removable_base_files, []) self.assertEqual(image_cache_manager.removable_base_files, [])
self.assertEqual(image_cache_manager.corrupt_base_files, []) self.assertEqual(image_cache_manager.corrupt_base_files, [])
def test_handle_base_image_checksum_fails(self): @mock.patch.object(libvirt_utils, 'update_mtime')
def test_handle_base_image_checksum_fails(self, mock_mtime):
self.flags(checksum_base_images=True, group='libvirt') self.flags(checksum_base_images=True, group='libvirt')
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None)
img = '123' img = '123'
@ -546,12 +545,15 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
image_cache_manager.used_images = {'123': (1, 0, ['banana-42'])} image_cache_manager.used_images = {'123': (1, 0, ['banana-42'])}
image_cache_manager._handle_base_image(img, fname) image_cache_manager._handle_base_image(img, fname)
mock_mtime.assert_called_once_with(fname)
self.assertEqual(image_cache_manager.unexplained_images, []) self.assertEqual(image_cache_manager.unexplained_images, [])
self.assertEqual(image_cache_manager.removable_base_files, []) self.assertEqual(image_cache_manager.removable_base_files, [])
self.assertEqual(image_cache_manager.corrupt_base_files, self.assertEqual(image_cache_manager.corrupt_base_files,
[fname]) [fname])
def test_verify_base_images(self): @mock.patch.object(libvirt_utils, 'update_mtime')
@mock.patch.object(lockutils, 'external_lock')
def test_verify_base_images(self, mock_lock, mock_mtime):
hashed_1 = '356a192b7913b04c54574d18c28d46e6395428ab' hashed_1 = '356a192b7913b04c54574d18c28d46e6395428ab'
hashed_21 = '472b07b9fcf2c2451e8781e944bf5f77cd8457c8' hashed_21 = '472b07b9fcf2c2451e8781e944bf5f77cd8457c8'
hashed_22 = '12c6fc06c99a462375eeb3f43dfd832b08ca9e17' hashed_22 = '12c6fc06c99a462375eeb3f43dfd832b08ca9e17'
@ -607,11 +609,6 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
self.stub_out('os.path.exists', lambda x: exists(x)) self.stub_out('os.path.exists', lambda x: exists(x))
self.stubs.Set(libvirt_utils, 'chown', lambda x, y: None)
# We need to stub utime as well
self.stub_out('os.utime', lambda x, y: None)
# Fake up some instances in the instances directory # Fake up some instances in the instances directory
orig_listdir = os.listdir orig_listdir = os.listdir
@ -857,13 +854,13 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
expect_set = set(['swap_123', 'swap_456']) expect_set = set(['swap_123', 'swap_456'])
self.assertEqual(image_cache_manager.back_swap_images, expect_set) self.assertEqual(image_cache_manager.back_swap_images, expect_set)
@mock.patch.object(libvirt_utils, 'chown') @mock.patch.object(lockutils, 'external_lock')
@mock.patch.object(libvirt_utils, 'update_mtime')
@mock.patch('os.path.exists', return_value=True) @mock.patch('os.path.exists', return_value=True)
@mock.patch('os.utime')
@mock.patch('os.path.getmtime') @mock.patch('os.path.getmtime')
@mock.patch('os.remove') @mock.patch('os.remove')
def test_age_and_verify_swap_images(self, mock_remove, mock_getmtime, def test_age_and_verify_swap_images(self, mock_remove, mock_getmtime,
mock_utime, mock_exist, mock_chown): mock_exist, mock_mtime, mock_lock):
image_cache_manager = imagecache.ImageCacheManager() image_cache_manager = imagecache.ImageCacheManager()
expected_remove = set() expected_remove = set()
expected_exist = set(['swap_128', 'swap_256']) expected_exist = set(['swap_128', 'swap_256'])
@ -894,6 +891,20 @@ class ImageCacheManagerTestCase(test.NoDBTestCase):
self.assertIn('swap_128', expected_exist) self.assertIn('swap_128', expected_exist)
self.assertIn('swap_256', expected_remove) self.assertIn('swap_256', expected_remove)
@mock.patch.object(utils, 'synchronized')
@mock.patch.object(imagecache.ImageCacheManager, '_get_age_of_file',
return_value=(True, 100))
def test_lock_acquired_on_removing_old_enough_files(self, mock_get_age,
mock_synchronized):
base_file = '/tmp_age_test'
lock_path = os.path.join(CONF.instances_path, 'locks')
lock_file = os.path.split(base_file)[-1]
image_cache_manager = imagecache.ImageCacheManager()
image_cache_manager._remove_old_enough_file(
base_file, 60, remove_sig=False, remove_lock=False)
mock_synchronized.assert_called_once_with(lock_file, external=True,
lock_path=lock_path)
class VerifyChecksumTestCase(test.NoDBTestCase): class VerifyChecksumTestCase(test.NoDBTestCase):

View File

@ -25,7 +25,7 @@ from nova.virt import images
class QemuTestCase(test.NoDBTestCase): class QemuTestCase(test.NoDBTestCase):
def test_qemu_info_with_bad_path(self): def test_qemu_info_with_bad_path(self):
self.assertRaises(exception.InvalidDiskInfo, self.assertRaises(exception.DiskNotFound,
images.qemu_img_info, images.qemu_img_info,
'/path/that/does/not/exist') '/path/that/does/not/exist')

View File

@ -54,8 +54,7 @@ def qemu_img_info(path, format=None):
CONF.import_opt('images_type', 'nova.virt.libvirt.imagebackend', CONF.import_opt('images_type', 'nova.virt.libvirt.imagebackend',
group='libvirt') group='libvirt')
if not os.path.exists(path) and CONF.libvirt.images_type != 'rbd': if not os.path.exists(path) and CONF.libvirt.images_type != 'rbd':
msg = (_("Path does not exist %(path)s") % {'path': path}) raise exception.DiskNotFound(location=path)
raise exception.InvalidDiskInfo(reason=msg)
try: try:
cmd = ('env', 'LC_ALL=C', 'LANG=C', 'qemu-img', 'info', path) cmd = ('env', 'LC_ALL=C', 'LANG=C', 'qemu-img', 'info', path)

View File

@ -531,10 +531,15 @@ class Raw(Image):
else: else:
if not os.path.exists(base): if not os.path.exists(base):
prepare_template(target=base, max_size=size, *args, **kwargs) prepare_template(target=base, max_size=size, *args, **kwargs)
# NOTE(mikal): Update the mtime of the base file so the image
# cache manager knows it is in use.
libvirt_utils.update_mtime(base)
self.verify_base_size(base, size) self.verify_base_size(base, size)
if not os.path.exists(self.path): if not os.path.exists(self.path):
with fileutils.remove_path_on_error(self.path): with fileutils.remove_path_on_error(self.path):
copy_raw_image(base, self.path, size) copy_raw_image(base, self.path, size)
self.correct_format() self.correct_format()
def resize_image(self, size): def resize_image(self, size):
@ -584,6 +589,10 @@ class Qcow2(Image):
# Download the unmodified base image unless we already have a copy. # Download the unmodified base image unless we already have a copy.
if not os.path.exists(base): if not os.path.exists(base):
prepare_template(target=base, max_size=size, *args, **kwargs) prepare_template(target=base, max_size=size, *args, **kwargs)
# NOTE(ankit): Update the mtime of the base file so the image
# cache manager knows it is in use.
libvirt_utils.update_mtime(base)
self.verify_base_size(base, size) self.verify_base_size(base, size)
legacy_backing_size = None legacy_backing_size = None

View File

@ -449,10 +449,18 @@ class ImageCacheManager(imagecache.ImageCacheManager):
if not exists: if not exists:
return return
if age < maxage: lock_file = os.path.split(base_file)[-1]
LOG.info(_LI('Base or swap file too young to remove: %s'),
base_file) @utils.synchronized(lock_file, external=True,
else: lock_path=self.lock_path)
def _inner_remove_old_enough_file():
# NOTE(mikal): recheck that the file is old enough, as a new
# user of the file might have come along while we were waiting
# for the lock
exists, age = self._get_age_of_file(base_file)
if not exists or age < maxage:
return
LOG.info(_LI('Removing base or swap file: %s'), base_file) LOG.info(_LI('Removing base or swap file: %s'), base_file)
try: try:
os.remove(base_file) os.remove(base_file)
@ -466,6 +474,11 @@ class ImageCacheManager(imagecache.ImageCacheManager):
{'base_file': base_file, {'base_file': base_file,
'error': e}) 'error': e})
if age < maxage:
LOG.info(_LI('Base or swap file too young to remove: %s'),
base_file)
else:
_inner_remove_old_enough_file()
if remove_lock: if remove_lock:
try: try:
# NOTE(jichenjc) The lock file will be constructed first # NOTE(jichenjc) The lock file will be constructed first
@ -473,7 +486,6 @@ class ImageCacheManager(imagecache.ImageCacheManager):
# like nova-9e881789030568a317fad9daae82c5b1c65e0d4a # like nova-9e881789030568a317fad9daae82c5b1c65e0d4a
# or nova-03d8e206-6500-4d91-b47d-ee74897f9b4e # or nova-03d8e206-6500-4d91-b47d-ee74897f9b4e
# according to the original file name # according to the original file name
lock_file = os.path.split(base_file)[-1]
lockutils.remove_external_lock_file(lock_file, lockutils.remove_external_lock_file(lock_file,
lock_file_prefix='nova-', lock_path=self.lock_path) lock_file_prefix='nova-', lock_path=self.lock_path)
except OSError as e: except OSError as e:
@ -562,8 +574,7 @@ class ImageCacheManager(imagecache.ImageCacheManager):
{'id': img_id, {'id': img_id,
'base_file': base_file}) 'base_file': base_file})
if os.path.exists(base_file): if os.path.exists(base_file):
libvirt_utils.chown(base_file, os.getuid()) libvirt_utils.update_mtime(base_file)
os.utime(base_file, None)
def _age_and_verify_swap_images(self, context, base_dir): def _age_and_verify_swap_images(self, context, base_dir):
LOG.debug('Verify swap images') LOG.debug('Verify swap images')
@ -571,8 +582,7 @@ class ImageCacheManager(imagecache.ImageCacheManager):
for ent in self.back_swap_images: for ent in self.back_swap_images:
base_file = os.path.join(base_dir, ent) base_file = os.path.join(base_dir, ent)
if ent in self.used_swap_images and os.path.exists(base_file): if ent in self.used_swap_images and os.path.exists(base_file):
libvirt_utils.chown(base_file, os.getuid()) libvirt_utils.update_mtime(base_file)
os.utime(base_file, None)
elif self.remove_unused_base_images: elif self.remove_unused_base_images:
self._remove_swap_file(base_file) self._remove_swap_file(base_file)

View File

@ -248,6 +248,14 @@ def chown(path, owner):
execute('chown', owner, path, run_as_root=True) execute('chown', owner, path, run_as_root=True)
def update_mtime(path):
"""Touch a file without being the owner.
:param path: File bump the mtime on
"""
execute('touch', '-c', path, run_as_root=True)
def _id_map_to_config(id_map): def _id_map_to_config(id_map):
return "%s:%s:%s" % (id_map.start, id_map.target, id_map.count) return "%s:%s:%s" % (id_map.start, id_map.target, id_map.count)

View File

@ -0,0 +1,13 @@
---
upgrade:
- Upgrade the rootwrap configuration for the compute service,
so that patches requiring new rootwrap configuration can be
tested with grenade.
fixes:
- In a race condition if base image is deleted by ImageCacheManager
while imagebackend is copying the image to instance path, then the
instance goes in to error state. In this case when libvirt has
changed the base file ownership to libvirt-qemu while imagebackend
is copying the image, then we get permission denied error on updating
the file access time using os.utime. Fixed this issue by updating the
base file access time with root user privileges using 'touch' command.