Source code for elasticluster.providers.openstack

#!/usr/bin/env python
# -*- coding: utf-8 -*-#
#
# Copyright (C) 2013, 2015, 2018-2019 University of Zurich.
# Copyright (C) 2020 ETH Zurich.
# Copyright (C) 2022 Google LLC.
#
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

__docformat__ = 'reStructuredText'
__author__ = ', '.join([
    'Antonio Messina <antonio.s.messina@gmail.com>',
    'Riccardo Murri <riccardo.murri@gmail.com>'
    ])

# stdlib imports
from builtins import range
from builtins import object
import os
import threading
from warnings import warn
from time import sleep

# External modules

# handle missing OpenStack libraries in Python 2.6: delay ImportError until actually
# used but raise a warning in the meantime; since we pinned the 2.6 dependencies
# to the pre-3.0.0 release, everything *should* work with just the "nova" API.
class _Unavailable(object):
    """
    Stand-in for an OpenStack client module that could not be imported.

    Only the ``Client`` factory of the real module is mimicked: calling
    it emits an `ImportWarning` and hands back an inert placeholder, so
    that `ImportError` is delayed until the client is actually used.
    """

    def __init__(self, missing):
        # name of the package that failed to import; only used in messages
        self.__missing = missing

    def Client(self, *args, **kwargs):
        """
        Warn that the module is missing and return a placeholder client.

        All arguments are accepted and ignored.  Any attribute looked up
        on the returned object yields the object itself; calling it
        raises `ImportError`.
        """
        module_name = self.__missing
        warn(
            ("Trying to initialize `{module}` which is not available."
             " A placeholder object will be used instead, but it will raise"
             " `ImportError` later if there is any actual attempt at using it.")
            .format(module=module_name),
            ImportWarning)

        # NOTE: the nested class reuses the name of the outer one
        class _Unavailable(object):
            """Placeholder client: fine to hold, fatal to call."""

            def __init__(self, missing):
                self.__missing = missing

            def __call__(self, *args, **kwargs):
                message = (
                    "Trying to actually use client class from module `{module}`"
                    " which could not be imported. Aborting."
                ).format(module=self.__missing)
                raise ImportError(message)

            def __getattr__(self, name):
                # any attribute chain (e.g. `client.images.list`)
                # collapses back onto the placeholder itself
                return self

        return _Unavailable(module_name)

import keystoneauth1
try:
    from glanceclient import client as glance_client
except ImportError:
    glance_client = _Unavailable('python-glanceclient')
try:
    from neutronclient.v2_0 import client as neutron_client
    from neutronclient.common.exceptions import BadRequest as BadNeutronRequest
except ImportError:
    neutron_client = _Unavailable('python-neutronclient')
    class BadNeutronRequest(Exception):
        """Placeholder to avoid syntax errors."""
        pass
from cinderclient import client as cinder_client
from novaclient import client as nova_client
from novaclient.exceptions import NotFound

from paramiko import DSSKey, RSAKey, PasswordRequiredException
from paramiko.ssh_exception import SSHException

# Elasticluster imports
from elasticluster import log
from elasticluster.utils import fingerprint_str, memoize
from elasticluster.providers import AbstractCloudProvider
from elasticluster.exceptions import (
    ConfigurationError,
    FlavorError,
    ImageError,
    InstanceNotFoundError,
    KeypairError,
    SecurityGroupError,
)


# Default versions of the OpenStack APIs.  The compute, image, network and
# volume values are the defaults of the corresponding `*_api_version`
# parameters of the provider constructor below.
# These defaults should be kept in sync w/ `conf.py`
DEFAULT_OS_COMPUTE_API_VERSION='2'
DEFAULT_OS_IDENTITY_API_VERSION='3'
DEFAULT_OS_IMAGE_API_VERSION='2'
DEFAULT_OS_NETWORK_API_VERSION='2.0'  # no choice as of Aug. 2017
DEFAULT_OS_VOLUME_API_VERSION='3'

# Sentinel object: it is compared by identity (`is`), so that `None` can
# still be passed as a legitimate default value.
_NO_DEFAULT = object()
"""
Special value used in `_get_os_config_value` to indicate that a
value *must* be provided.
"""



[docs]class OpenStackCloudProvider(AbstractCloudProvider):
    """
    This implementation of
    :py:class:`elasticluster.providers.AbstractCloudProvider` uses the
    OpenStack native python bindings to connect to OpenStack clouds
    and manage instances.

    :param str username: username of the keystone user
    :param str password: password of the keystone user
    :param str project_name: name of the project to use
    :param str user_domain_name: name of the user domain to use
    :param str project_domain_name: name of the project domain to use
    :param str auth_url: url of keystone endpoint
    :param str region_name: OpenStack region to use
    :param str storage_path: path to store temporary data
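    :param bool request_floating_ip: (Deprecated: set this in the cluster
        or node configuration instead.)  Whether a floating IP is
        allocated and attached to new instances automatically (`True`),
        or floating IPs have to be assigned manually (`False`).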
    :param identity_api_version: What version of the Keystone API to use.
        Valid values are the strings `"2"` or `"3"`, or `None` (default,
        meaning: auto-detect it from the environment or the auth URL and,
        failing that, try v3 first and fall back to v2).
    :param cacert: Path to CA certificate bundle (for verifying HTTPS sessions)
        or ``None`` to use the systems' default.
    :param bool use_anti_affinity_groups:
        Place nodes of a cluster in the same anti-affinity group.

    Parameters *username*, *password*, *user_domain_name*,
    *project_name*, *project_domain_name*, and *region_name* will be
    taken from the environment if not provided.  Similarly,
    environmental variables can be used to set values for the
    preferred version of identity, compute, image, network, and volume
    API to use.

    Note that, for the connection parameters above, a value set in the
    environment currently takes precedence over the one passed to the
    constructor (a `FutureWarning` is issued when the two differ, as
    this may change in the future); the default value in the class
    (if any) is only used when neither is given.
    """

    __node_start_lock = threading.Lock()
    """
    Lock used for node startup.
    """

    _aaf_groups = {}
    """
    Provider-wide map of cluster names to Anti-Affinity groups in use.
    """

    OUT_OF_CAPACITY_ERRMSG = 'There are not enough hosts available.'
    """
    Fault message from OpenStack API indicating AAF group full.
    """

    def __init__(self,
                 username=None,
                 password=None,
                 project_name=None,
                 auth_url=None,
                 user_domain_name="default", project_domain_name="default",
                 region_name=None,
                 availability_zone=None,
                 storage_path=None,
                 compute_api_version=DEFAULT_OS_COMPUTE_API_VERSION,
                 image_api_version=DEFAULT_OS_IMAGE_API_VERSION,
                 network_api_version=DEFAULT_OS_NETWORK_API_VERSION,
                 volume_api_version=DEFAULT_OS_VOLUME_API_VERSION,
                 # this can be auto-detected
                 identity_api_version=None,
                 # this is deprecated in favor of `compute_api_version`
                 nova_api_version=None,
                 cacert=None,  # keep in sync w/ default in novaclient.Client()
                 use_anti_affinity_groups=False,
                 request_floating_ip=None,  ## DEPRECATED, will be removed
                 build_timeout=30,
    ):
        """
        Record connection parameters, API versions, and local state.

        No API client is created here: the client attributes are only
        reset to ``None`` and get filled in by `_init_os_api()` later.
        """
        lookup = self._get_os_config_value

        # OpenStack connection params; each one is resolved from the
        # environment, the given argument, or a default (see
        # `_get_os_config_value`)
        self._os_auth_url = lookup(
            'auth URL', auth_url, ['OS_AUTH_URL']).rstrip('/')
        self._os_cacert = lookup(
            'cacert', cacert, ['OS_CACERT'], default=None)
        self._os_username = lookup(
            'user name', username, ['OS_USERNAME'])
        self._os_user_domain_name = lookup(
            'user domain name', user_domain_name,
            ['OS_USER_DOMAIN_NAME'], 'default')
        self._os_password = lookup(
            'password', password, ['OS_PASSWORD'])
        self._os_tenant_name = lookup(
            'project name', project_name,
            ['OS_PROJECT_NAME', 'OS_TENANT_NAME'])
        self._os_project_domain_name = lookup(
            'project domain name', project_domain_name,
            ['OS_PROJECT_DOMAIN_NAME'], 'default')
        self._os_region_name = lookup(
            'region_name', region_name, ['OS_REGION_NAME'], '')
        self.availability_zone = availability_zone
        self.build_timeout = build_timeout

        # API versions: the deprecated `nova_api_version`, if given,
        # overrides `compute_api_version`
        if nova_api_version is not None:
            warn('Deprecated parameter `nova_api_version` given to OpenStackProvider;'
                 ' use `compute_api_version` instead', DeprecationWarning)
            compute_api_version = nova_api_version
        self._compute_api_version = compute_api_version
        self._image_api_version = image_api_version
        self._network_api_version = network_api_version

        # the Neutron API version cannot be chosen: only warn if the
        # environment asks for something different from the default
        env_network_api_version = os.getenv('OS_NETWORK_API_VERSION', None)
        if env_network_api_version:
            if env_network_api_version != DEFAULT_OS_NETWORK_API_VERSION:
                warn("Environment variable OS_NETWORK_API_VERSION set,"
                     " but ElastiCluster does not support selecting"
                     " the OpenStack Networking API (Neutron) version yet.",
                     UserWarning)
        self._volume_api_version = volume_api_version

        # an explicitly given identity version wins over auto-detection
        if not identity_api_version:
            identity_api_version = self.__detect_os_identity_api_version()
        self._identity_api_version = identity_api_version

        # these will be initialized later by `_init_os_api()`
        self.nova_client = None
        self.neutron_client = None
        self.glance_client = None
        self.cinder_client = None

        # local state
        self._instances = {}
        self._cached_instances = {}

        self.use_anti_affinity_groups = use_anti_affinity_groups

        # kept only as the fall-back value for the per-node setting
        if request_floating_ip is not None:
            warn('Deprecated parameter `request_floating_ip` given'
                 ' to OpenStackProvider; place it in the cluster'
                 ' or node configuration instead.', DeprecationWarning)
        self._request_floating_ip_default = request_floating_ip

[docs]    def to_vars_dict(self):
        """
        Return local state which is relevant for the cluster setup process.
        """
        return {
            # connection data (= what is in the "openrc" file)
            'os_auth_url':             self._os_auth_url,
            'os_cacert':               (self._os_cacert or ''),
            'os_password':             self._os_password,
            'os_project_domain_name':  self._os_project_domain_name,
            'os_region_name':          self._os_region_name,
            'os_tenant_name':          self._os_tenant_name,
            'os_user_domain_name':     self._os_user_domain_name,
            'os_username':             self._os_username,
            # API versioning
            'os_compute_api_version':   self._compute_api_version,
            'os_identity_api_version':  self._identity_api_version,
            'os_image_api_version':     self._image_api_version,
            'os_network_api_version':   self._network_api_version,
            'os_volume_api_version':    self._volume_api_version,
        }

    @staticmethod
    def _get_os_config_value(thing, value, varnames, default=_NO_DEFAULT):
        """
        Resolve one OpenStack setting from environment, argument, or default.

        :param str thing: human-readable name of the setting (for messages)
        :param value: value given in the configuration, or ``None``
        :param list varnames: names of the environment variables to look
          up, in order of preference; must not be empty
        :param default: fall-back value; if omitted, the setting is mandatory

        :return: the value of the first environment variable in `varnames`
          that is set; otherwise `value` if it is true-ish; otherwise `default`
        :raise RuntimeError: if no value was found and no default was given
        """
        assert varnames, "List of env variable names cannot be empty"
        for varname in varnames:
            from_env = os.getenv(varname, None)
            if from_env is None:
                continue
            if value is None or from_env == value:
                log.debug('OpenStack %s taken from env variable %s',
                          thing, varname)
            else:
                # conflicting settings: the environment wins (for now)
                warn("OpenStack {thing} is present both in the environment"
                     " and the config file. Environment variable {varname}"
                     " takes precedence, but this may change in the future."
                     .format(thing=thing, varname=varname),
                     FutureWarning)
            return from_env
        if value:
            return value
        if default is not _NO_DEFAULT:
            return default
        # first variable name is preferred; others are for backwards-compatibility only
        raise RuntimeError(
            "There is no default value for OpenStack {0};"
            " please specify one in the config file"
            " or using environment variable {1}."
            .format(thing, varnames[0]))

    def _init_os_api(self):
        """
        Initialise client objects for talking to OpenStack API.

        The public methods of this class call this before doing any
        actual work; it returns immediately if the Nova client has
        already been created, so calling it repeatedly is cheap.
        """
        if self.nova_client:
            return

        region = self._os_region_name
        cacert = self._os_cacert

        log.debug("Initializing OpenStack API clients:"
                  " OS_AUTH_URL='%s'"
                  " OS_USERNAME='%s'"
                  " OS_USER_DOMAIN_NAME='%s'"
                  " OS_PROJECT_NAME='%s'"
                  " OS_PROJECT_DOMAIN_NAME='%s'"
                  " OS_REGION_NAME='%s'"
                  " OS_CACERT='%s'"
                  "",
                  self._os_auth_url,
                  self._os_username,
                  self._os_user_domain_name,
                  self._os_tenant_name,
                  self._os_project_domain_name,
                  region,
                  (cacert or ''))
        # all clients share one and the same Keystone session
        sess = self.__init_keystone_session()

        log.debug("Creating OpenStack Compute API (Nova) v%s client ...",
                  self._compute_api_version)
        self.nova_client = nova_client.Client(
            self._compute_api_version,
            session=sess, region_name=region, cacert=cacert)

        log.debug("Creating OpenStack Network API (Neutron) client ...")
        # `self._network_api_version` is deliberately *not* passed here:
        # it doesn't work as of Neutron Client 2 :-(
        self.neutron_client = neutron_client.Client(
            session=sess, region_name=region, ca_cert=cacert)

        # FIXME: Glance's `Client` class does not take an explicit
        # `cacert` parameter, instead it relies on the `session`
        # argument being "A keystoneauth1 session that should be
        # used for transport" -- I presume this means that
        # `cacert` only needs to be set there.  Is this true of
        # other OpenStack client classes as well?
        log.debug("Creating OpenStack Image API (Glance) v%s client ...",
                  self._image_api_version)
        self.glance_client = glance_client.Client(
            self._image_api_version, session=sess, region_name=region)

        log.debug("Creating OpenStack Volume API (Cinder) v%s client ...",
                  self._volume_api_version)
        self.cinder_client = cinder_client.Client(
            self._volume_api_version,
            session=sess, region_name=region, cacert=cacert)

    def __init_keystone_session(self):
        """
        Create and return a Keystone session object.

        The Identity API version to use is read from
        ``self._identity_api_version``; spellings like ``'3'``, ``'v3'``,
        ``'2'``, ``'v2'`` and ``'2.0'`` are all accepted.  If the version
        is ``None``, v3 is tried first and then v2, and each candidate
        session is checked to actually work before being returned.

        :return: the session object created by the first version that works
        :raise RuntimeError: if the version is not supported, or if no
          session could be established with any of the versions tried
        """
        api = self._identity_api_version  # for readability
        if api is not None:
            # reduce the version to its major number, so that e.g. `v3`
            # and `2.0` are matched by the checks below
            api = str(api).strip().lower()
            if api.startswith('v'):
                api = api[1:]
            api = api.split('.', 1)[0]
            if api not in ('2', '3'):
                raise RuntimeError(
                    "Unsupported OpenStack Identity API version `{0}`:"
                    " valid values are `2` and `3`."
                    .format(self._identity_api_version))
        tried = []
        if api in ('3', None):
            # only probe the session if the version was not explicitly set
            sess = self.__init_keystone_session_v3(check=(api is None))
            tried.append('v3')
            if sess:
                return sess
        if api in ('2', None):
            sess = self.__init_keystone_session_v2(check=(api is None))
            tried.append('v2')
            if sess:
                return sess
        raise RuntimeError(
            "Cannot establish Keystone session (tried: {0})."
            .format(', '.join(tried)))

    def __init_keystone_session_v2(self, check=False):
        """
        Return a new session object, created using Keystone API v2.

        :param bool check: if true, verify that the session works by
          listing the available flavors through a throw-away Nova client
        :return: the session object, or ``None`` if the check failed
        """
        from keystoneauth1 import loading as keystone_v2

        credentials = dict(
            auth_url=self._os_auth_url,
            username=self._os_username,
            password=self._os_password,
            project_name=self._os_tenant_name,
        )
        password_plugin = keystone_v2.get_plugin_loader('password')
        auth = password_plugin.load_from_options(**credentials)
        sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)
        if check:
            log.debug("Checking that Keystone API v2 session works...")
            try:
                # if session is invalid, the following will raise some exception
                probe = nova_client.Client(
                    self._compute_api_version,
                    session=sess, cacert=self._os_cacert)
                probe.flavors.list()
            except keystoneauth1.exceptions.NotFound as err:
                log.warning("Creating Keystone v2 session failed: %s", err)
                return None
            except keystoneauth1.exceptions.ClientException as err:
                log.error("OpenStack server rejected request (likely configuration error?): %s", err)
                return None  # FIXME: should we be raising an error instead?
        # if we got to this point, v2 session is valid
        log.info("Using Keystone API v2 session to authenticate to OpenStack")
        return sess

    def __init_keystone_session_v3(self, check=False):
        """
        Return a new session object, created using Keystone API v3.

        .. note::

          Note that the only supported authN method is password authentication;
          token or other plug-ins are not currently supported.

        :param bool check: if true, verify that the session works by
          listing the available flavors through a throw-away Nova client
        :return: the session object, or ``None`` if the v3 library cannot
          be imported or the check failed
        """
        try:
            # may fail on Python 2.6?
            from keystoneauth1.identity import v3 as keystone_v3
        except ImportError:
            log.warning("Cannot load Keystone API v3 library.")
            return None

        credentials = dict(
            auth_url=self._os_auth_url,
            username=self._os_username,
            password=self._os_password,
            user_domain_name=self._os_user_domain_name,
            project_domain_name=self._os_project_domain_name,
            project_name=self._os_tenant_name,
        )
        auth = keystone_v3.Password(**credentials)
        sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)
        if check:
            log.debug("Checking that Keystone API v3 session works...")
            try:
                # if session is invalid, the following will raise some exception
                probe = nova_client.Client(
                    self._compute_api_version, session=sess)
                probe.flavors.list()
            except keystoneauth1.exceptions.NotFound as err:
                log.warning("Creating Keystone v3 session failed: %s", err)
                return None
            except keystoneauth1.exceptions.ClientException as err:
                log.error("OpenStack server rejected request (likely configuration error?): %s", err)
                return None  # FIXME: should we be raising an error instead?
        # if we got to this point, v3 session is valid
        log.info("Using Keystone API v3 session to authenticate to OpenStack")
        return sess


    def __detect_os_identity_api_version(self):
        """
        Return preferred OpenStack Identity API version, or ``None``.

        The result is one of the two strings ``'2'`` or ``'3'``, chosen
        by the first of these strategies that gives an answer:

        #. the environmental variable `OS_IDENTITY_API_VERSION` is
           ``3``, or is ``2`` or starts with ``2.``;
        #. the OpenStack auth URL ends with a version tag ``/v3``
           or ``/v2.0``.

        If none of the above worked, return ``None``: the version then
        needs to be found out by probing.

        For more information on ``OS_IDENTITY_API_VERSION``, please see
        `<https://docs.openstack.org/developer/python-openstackclient/authentication.html>`_.
        """
        ver = os.getenv('OS_IDENTITY_API_VERSION', '')
        if ver == '3':
            log.debug(
                "Using OpenStack Identity API v3"
                " because of environmental variable setting `OS_IDENTITY_API_VERSION=3`")
            return '3'
        if ver == '2' or ver.startswith('2.'):
            log.debug(
                "Using OpenStack Identity API v2"
                " because of environmental variable setting `OS_IDENTITY_API_VERSION=2`")
            return '2'

        # nothing usable in the environment: look at the auth URL
        auth_url = self._os_auth_url
        if auth_url.endswith('/v3'):
            log.debug(
                "Using OpenStack Identity API v3 because of `/v3` ending in auth URL;"
                " set environmental variable OS_IDENTITY_API_VERSION to force use of Identity API v2 instead.")
            return '3'
        if auth_url.endswith('/v2.0'):
            log.debug(
                "Using OpenStack Identity API v2 because of `/v2.0` ending in auth URL;"
                " set environmental variable OS_IDENTITY_API_VERSION to force use of Identity API v3 instead.")
            return '2'

        # auto-detection failed, need to probe
        return None


[docs]    def start_instance(self, key_name, public_key_path, private_key_path,
                       security_group, flavor, image_id, image_userdata,
                       cluster_name, username=None, node_name=None, **kwargs):
        """Starts a new instance on the cloud using the given properties.
        The following tasks are done to start an instance:

        * establish a connection to the cloud web service
        * check ssh keypair and upload it if it does not yet exist. This is
          a locked process, since this function might be called in multiple
          threads and we only want the key to be stored once.
        * check if the security group exists
        * run the instance with the given properties

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image type (os) to use for the instance
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None

        :return: str - instance id of the started instance
        """
        self._init_os_api()

        max_wait = int(kwargs.get('max_wait', 300))

        vm_start_args = {}

        if self.availability_zone:
            log.debug("Starting node `%s` in availability zone `%s`.",
                      node_name, self.availability_zone)
            vm_start_args['availability_zone'] = self.availability_zone

        log.debug("Checking keypair `%s` ...", key_name)
        with OpenStackCloudProvider.__node_start_lock:
            self._check_keypair(key_name, public_key_path, private_key_path)
        vm_start_args['key_name'] = key_name

        security_groups = [sg.strip() for sg in security_group.split(',')]
        self._check_security_groups(security_groups)
        vm_start_args['security_groups'] = security_groups


        # Check if the image id is present.
        if image_id not in [img.id for img in self._get_images()]:
            raise ImageError(
                    "No image found with ID `{0}` in project `{1}` of cloud {2}"
                    .format(image_id, self._os_tenant_name, self._os_auth_url))
        vm_start_args['userdata'] = image_userdata

        # Check if the flavor exists
        flavors = [fl for fl in self._get_flavors() if fl.name == flavor]
        if not flavors:
            raise FlavorError(
                "No flavor found with name `{0}` in project `{1}` of cloud {2}"
                .format(flavor, self._os_tenant_name, self._os_auth_url))
        flavor = flavors[0]

        availability_zone = kwargs.pop('availability_zone','')
        vm_start_args['availability_zone']=availability_zone

        network_ids = [net_id.strip()
                       for net_id in kwargs.pop('network_ids', '').split(',')]
        if network_ids:
            nics = [{'net-id': net_id, 'v4-fixed-ip': ''}
                    for net_id in network_ids ]
            log.debug("Specifying networks for node %s: %s",
                      node_name, ', '.join([nic['net-id'] for nic in nics]))
        else:
            nics = None
        vm_start_args['nics'] = nics

        if 'boot_disk_size' in kwargs:
            # check if the backing volume is already there
            volume_name = '{name}-{id}'.format(name=node_name, id=image_id)
            if volume_name in self._get_volumes():
                raise ImageError(
                    "Volume `{0}` already exists in project `{1}` of cloud {2}"
                    .format(volume_name, self._os_tenant_name, self._os_auth_url))

            log.info('Creating volume `%s` to use as VM disk ...', volume_name)
            try:
                bds = int(kwargs['boot_disk_size'])
                if bds < 1:
                    raise ValueError('non-positive int')
            except (ValueError, TypeError):
                raise ConfigurationError(
                    "Invalid `boot_disk_size` specified:"
                    " should be a positive integer, got {0} instead"
                    .format(kwargs['boot_disk_size']))
            volume = self.cinder_client.volumes.create(
                size=bds, name=volume_name, imageRef=image_id,
                volume_type=kwargs.pop('boot_disk_type'))

            # wait for volume to come up
            waited = 0
            while waited < max_wait:
                volumes = self._get_volumes()
                if (volume_name in volumes
                    and volumes[volume_name].status == 'available'):
                        break
                sleep(1)  # FIXME: hard-coded waiting time
                waited += 1
            if waited >= max_wait:
                raise RuntimeError(
                    "Volume `{0}` (ID: {1}) didn't come up in {2:d} seconds"
                    .format(volume_name, volume.id, max_wait))

            # ok, use volume as VM disk
            vm_start_args['block_device_mapping'] = {
                # FIXME: is it possible that `vda` is not the boot disk? e.g. if
                # a non-paravirtualized kernel is being used?  should we allow
                # to set the boot device as an image parameter?
                'vda': ('{id}:::{delete_on_terminate}'
                        .format(id=volume.id, delete_on_terminate=1)),
            }

        result = None
        retry = 2  # FIXME: should this be configurable?
        while retry > 0:
            retry -= 1
            if self.use_anti_affinity_groups:
                # create a server anti-affinity group, spawn hosts in the
                # group until it's full and then create a new group
                aaf_group = self._get_aaf_group(cluster_name)
                group_id, group_name, req_handle = aaf_group.get()
                vm_start_args['scheduler_hints'] = { 'group' : group_id }
                in_group_msg = (' in group {0}'.format(group_name))
            else:
                # still need to define this for logging
                in_group_msg = ''
            # due to some `nova_client.servers.create()` implementation weirdness,
            # the first three args need to be spelt out explicitly and cannot be
            # conflated into `**vm_start_args`
            vm = self.nova_client.servers.create(node_name, image_id, flavor, **vm_start_args)
            log.debug(
                "Attempting to start VM instance `%s` (%s)%s ...",
                vm.name, vm.id, in_group_msg)

            self._wait_for_status(vm, ["ACTIVE", "ERROR"], self.build_timeout)
            if vm.status == 'ACTIVE':
                log.debug("Started VM instance `%s` (%s)", vm.name, vm.id)
                result = { 'instance_id': vm.id }
                if self.use_anti_affinity_groups:
                    result['anti_affinity_group_id'] = group_id
                break  # out of `while retry > 0:`
            elif vm.status == 'ERROR':
                if (self.use_anti_affinity_groups
                    # FIXME: is there a better way to determine the
                    # cause of the error than parsing the fault message?
                    and self._get_fault_message(vm) == self.OUT_OF_CAPACITY_ERRMSG):
                    log.debug(
                        "Got 'not enough hosts available' error message;"
                        " assuming group %s(%s) is full and retrying"
                        " with new anti-affinity group.",
                        group_name, group_id)
                    aaf_group.full(req_handle)
                log.warning(
                    ("Could not start VM instance `%s` (%s)%s: %s"
                     " Deleting it."),
                    vm.name, vm.id, in_group_msg,
                    self._get_fault_message(vm) or 'unspecified error')
                self.nova_client.servers.delete(vm.id)
            else: # VM still building?
                log.warning(
                    ("VM instance `%s` (%s) not yet ACTIVE after %d-seconds timeout."
                     " Deleting it."),
                    vm.name, vm.id, self.build_timeout)
                self.nova_client.servers.delete(vm.id)

        # allocate and attach a floating IP, if requested
        request_floating_ip = kwargs.get(
            'request_floating_ip',
            self._request_floating_ip_default)

        if request_floating_ip:

            # wait for server to come up (otherwise floating IP can't be associated)
            log.info("Waiting for VM instance `%s` (%s) to come up ...", node_name, vm.id)
            waited = 0
            while waited < max_wait:
                if vm.status == 'ACTIVE':
                    break
                if vm.status == 'ERROR':
                    raise RuntimeError(
                        "Failed to start VM {0}:"
                        " OpenStack scheduling error."
                        .format(vm.id))
                vm = self.nova_client.servers.get(vm.id)
                # FIXME: Configurable poll interval
                sleep(3)
                waited += 3
            if waited >= max_wait:
                raise RuntimeError(
                    "VM {0} didn't come up in {1:d} seconds"
                    .format(vm.id, max_wait))

            # We need to list the floating IPs for this instance
            try:
                # python-novaclient <8.0.0
                floating_ips = [ip for ip in self.nova_client.floating_ips.list()
                                if ip.instance_id == vm.id]
            except AttributeError:
                floating_ips = (
                    self.neutron_client
                    .list_floatingips(id=vm.id)
                    .get('floating_ips', []))
            # allocate new floating IP if none given
            if not floating_ips:
                if 'floating_network_id' in kwargs:
                    floating_networks = [kwargs.pop('floating_network_id')]
                else:
                    floating_networks = network_ids[:]
                ip_addr = self._allocate_address(vm, floating_networks)
                log.debug("VM `%s` was allocated floating IP: %r", vm.id, ip_addr)
            else:
                log.debug("VM `%s` already allocated floating IPs: %r", vm.id, floating_ips)

        self._instances[vm.id] = vm

        return result

[docs]    def stop_instance(self, node):
        """
        Destroy a VM.

        :param Node node: A `Node`:class: instance.
        """
        self._init_os_api()
        instance = self._load_instance(node.instance_id)
        instance.delete()
        anti_affinity_group = node['extra'].get('anti_affinity_group_id', None)
        if anti_affinity_group:
            # FIXME: OpenStack happily deletes a server group even if
            # there are servers in it, so the current code has a flaw
            # in that we can delete an entire server group by removing
            # only one node -- so resizing down then up may result in
            # nodes being incorrectly distributed w.r.t. to affinity.
            try:
                self.nova_client.server_groups.delete(anti_affinity_group)
            except NotFound:
                pass
        return self._instances.pop(node.instance_id, None)

[docs]    def resume_instance(self, instance_state):
        raise NotImplementedError("This provider does not (yet) support pause / resume logic.")

[docs]    def pause_instance(self, instance_id):
        raise NotImplementedError("This provider does not (yet) support pause / resume logic.")

[docs]    def get_ips(self, instance_id):
        """Retrieves all IP addresses associated to a given instance.

        :return: tuple (IPs)
        """
        self._init_os_api()
        instance = self._load_instance(instance_id)
        try:
            ip_addrs = set([self.floating_ip])
        except AttributeError:
            ip_addrs = set([])
        for ip_addr in sum(instance.networks.values(), []):
            ip_addrs.add(ip_addr)
        log.debug("VM `%s` has IP addresses %r", instance_id, ip_addrs)
        return list(ip_addrs)

[docs]    def is_instance_running(self, instance_id):
        """Checks if the instance is up and running.

        :param str instance_id: instance identifier

        :return: bool - True if running, False otherwise
        """
        self._init_os_api()
        # Here, it's always better if we update the instance.
        instance = self._load_instance(instance_id, force_reload=True)
        return instance.status == 'ACTIVE'

    # Protected methods

    def _wait_for_status(self, vm, accepted_statuses, attempts):
        for i in range(attempts):
            vm.get()
            if vm.status in accepted_statuses:
                break
            sleep(1)

    def _get_aaf_group(self, cluster):
        with self.__node_start_lock:
            if cluster not in self._aaf_groups:
                self._aaf_groups[cluster] = self.AntiAffinityGroup(
                    self.nova_client, ('elasticluster.{}'.format(cluster)))
        return self._aaf_groups[cluster]

[docs]    class AntiAffinityGroup(object):
        """
        Interface to OpenStack's Anti-Affinity Groups.

        A single instance of this class should manage all the AAf
        groups for a cluster.  Use like this:

        1. Initialise class with a unique string; as the list of AAF
           groups used by a cluster is not persisted, the unique
           string is used as a marker for recovering the managed
           groups from OpenStack's list.

        2. Prior to starting a node ("creating a server" in
           OpenStack's language), call :meth:`get` which returns the
           group ID and a a *request handle*.

        3. If node creation fails because the AAF group has no more
           slots available, then call :meth:`full` passing the
           request handle and try again.

        This approach is needed because we cannot probe for 'available
        slots' in a AAF group: the only way to find out if a server
        can be added to an AAF group is to actually try to start it.

        The code is thread-safe and can be
        called concurrently; requests handles are the mechanism used
        to make sure that new groups are created only when actually
        needed.
        """

        _lock = threading.RLock()
        """
        Class-shared re-entrant lock to ensure only one thread uses the
        `server_groups.create()` call.
        """

        def __init__(self, nova_client, prefix):
            self._do = nova_client.server_groups
            self._prefix = prefix
            self._req_token = 0
            self._reset_token = 0
            groups = self.__list()
            if not groups:
                self._index = 0
                self.__new()
            else:
                # determine highest-numbered AAf group
                self._index = 0
                for group in groups:
                    tail = group.name.split('.')[-1]
                    try:
                        i = int(tail)
                        if i > self._index:
                            self._index = i
                            self._group = group
                    except ValueError:
                        log.warning(
                            "Ignoring server group `%s` (%s),"
                            " as it does not seem to have been created by ElastiCluster:"
                            " tail part `%s` is not a number.",
                            group.name, group.id, tail)

[docs]        def delete_all(self):
            """
            Delete all anti-affinity groups with the given prefix.
            """
            group_ids = { group.name:group.id for group in self.__list() }
            # in principle, `self._list()` can return groups that
            # start with the given prefix but were not created by
            # ElastiCluster (see above) -- so delete only the groups
            # that match our `prefix.index` pattern (even in this case
            # we could be deleting too much, but there's no notion of
            # the "creator" of a group)
            for index in range(self._index):
                name = self.__name(index)
                try:
                    group_id = group_ids[name]
                    self._do.delete(group_id)
                except KeyError:
                    continue
                except Exception as err:  # pylint: disable=broad-except
                    log.info(
                        "Ignoring error deleting group `%s` (%s): %s",
                        name, group_id, err)
                    continue

[docs]        def full(self, req_handle):
            """
            Signal that the group associated to the given request cannot

            If the group is still in active use, force next call to
            `.get()` to create a new group; otherwise, this is a no-op.
            """
            if req_handle >= self._reset_token:
                with self._lock:
                    self._req_token += 1
                    self.__new()
                    self._reset_token = self._req_token

[docs]        def get(self):
            """
            Return current AAF group ID and name.
            """
            with self._lock:
                self._req_token += 1
            return self._group.id, self._group.name, self._req_token

        def __list(self):
            """
            Return list of anti-affinity groups matching the given prefix.
            """
            return [
                group for group in self._do.list()
                if (group.name.startswith(self._prefix)
                    # ensure `group.name.split('.')` has >0 elements
                    and '.' in group.name
                    and
                    # depending on the Nova API version, we might
                    # get a response with `group.policy` (a string)
                    # or `group.policies` (list of str) attributes
                    ('anti-affinity' in getattr(group, 'policies', [])
                     or ('anti-affinity' == getattr(group, 'policy', ''))))
            ]

        def __name(self, index=None):
            """
            Return name of group with the given prefix and index.

            If optional argument *index* is not given, the current
            value of `self._index` is used. Value for the prefix
            always comes from the prefix specified at construction time.
            """
            return (
                '{prefix}.{index}'
                .format(
                    prefix=self._prefix,
                    index=(index if index is not None else self._index),
                    ))

        def __new(self):
            """
            Create new anti-affinity group.
            """
            # needs re-entrant lock because of use in `self.full()`
            with self._lock:
                self._index += 1
                self._group = self._do.create(
                    name=self.__name(), policies='anti-affinity')

    def _check_keypair(self, name, public_key_path, private_key_path):
        """
        Ensure that keypair *name* on the cloud matches the local key.

        The private key file is read first (tried as a DSA key, then
        as an RSA key).  If a keypair called *name* already exists on
        the cloud, its fingerprint is compared with the local key's;
        if no such keypair exists, the public key file is uploaded
        under that name.

        When the private key is password-protected it cannot be read,
        so a warning is issued and the fingerprint comparison is
        skipped.

        :param str name: name of the ssh key
        :param str public_key_path: path to the ssh public key file
        :param str private_key_path: path to the ssh private key file

        :raises: `KeypairError` if key is not a valid RSA or DSA key,
                 the key could not be uploaded or the fingerprint does not
                 match to the one uploaded to the cloud.
        """
        self._init_os_api()
        encrypted_key_warning = (
            "Unable to check key file `{0}` because it is encrypted with a "
            "password. Please, ensure that you added it to the SSH agent "
            "with `ssh-add {1}`"
            .format(private_key_path, private_key_path))

        # Read key. We do it as first thing because we need it either
        # way, to check the fingerprint of the remote keypair if it
        # exists already, or to create a new keypair.
        pkey = None
        try:
            pkey = DSSKey.from_private_key_file(private_key_path)
        except PasswordRequiredException:
            warn(encrypted_key_warning)
        except SSHException:
            # not a DSA key: try again as RSA
            try:
                pkey = RSAKey.from_private_key_file(private_key_path)
            except PasswordRequiredException:
                warn(encrypted_key_warning)
            except SSHException:
                raise KeypairError('File `%s` is neither a valid DSA key '
                                   'or RSA key.' % private_key_path)

        try:
            # Check if a keypair `name` exists on the cloud.
            keypair = self.nova_client.keypairs.get(name)

            # Check if it has the correct keypair, but only if we can read the local key
            if pkey:
                pkey_fingerprint = fingerprint_str(pkey)
                if pkey_fingerprint != keypair.fingerprint:
                    raise KeypairError(
                        "Keypair `%s` is present but has "
                        "different fingerprint. Aborting!" % name)
            else:
                warn("Unable to check if the keypair is using the correct key.")
        except NotFound:
            log.warning(
                "Keypair `%s` not found on resource `%s`, Creating a new one",
                name, self._os_auth_url)

            # Create a new keypair; the file only needs to stay open
            # for as long as it takes to read it
            with open(os.path.expanduser(public_key_path)) as f:
                key_material = f.read()
            try:
                self.nova_client.keypairs.create(name, key_material)
            except Exception as ex:
                # arguments follow the order of the placeholders:
                # key file first, keypair name second
                log.error(
                    "Could not import key `%s` with name `%s` to `%s`",
                    public_key_path, name, self._os_auth_url)
                raise KeypairError(
                    "could not create keypair `%s`: %s" % (name, ex))


    def _check_security_groups(self, names):
        """
        Raise an exception if any of the named security groups does not exist.

        :param List[str] groups: List of security group names
        :raises: `SecurityGroupError` if group does not exist
        """
        self._init_os_api()
        log.debug("Checking existence of security group(s) %s ...", names)
        try:
            # python-novaclient < 8.0.0
            security_groups = self.nova_client.security_groups.list()
            existing = set(sg.name for sg in security_groups)
        except AttributeError:
            security_groups = self.neutron_client.list_security_groups()['security_groups']
            existing = set(sg[u'name'] for sg in security_groups)

        # TODO: We should be able to create the security group if it
        # doesn't exist and at least add a rule to accept ssh access.
        # Also, we should be able to add new rules to a security group
        # if needed.
        nonexisting = set(names) - existing
        if nonexisting:
            raise SecurityGroupError(
                "Security group(s) `{0}` do not exist"
                .format(', '.join(nonexisting)))

        # if we get to this point, all sec groups exist
        return True

    @memoize(120)
    def _get_images(self):
        """
        Return the list of available images.

        Results are cached (see the `memoize` decorator) in order to
        reduce network usage.
        """
        self._init_os_api()
        try:
            # python-novaclient < 8.0.0
            images = self.nova_client.images.list()
        except AttributeError:
            # ``glance_client.images.list()`` returns a generator, but callers
            # of `._get_images()` expect a Python list
            images = list(self.glance_client.images.list())
        return images

    @staticmethod
    def _get_fault_message(vm):
        """
        Given a novaclient's `server` object, return the error message
        associated with a fault, or ``None`` if there is no fault or
        no error message was returned by the OS API.
        """
        try:
            return vm.fault['message']
        except (AttributeError, KeyError):
            # no fault, or fault message unspecified
            return None

    def _get_volumes(self):
        """Return list of available volumes."""
        self._init_os_api()
        return dict((volume.name, volume)
                    for volume in self.cinder_client.volumes.list())

    @memoize(120)
    def _get_flavors(self):
        """
        Return the list of available flavors.

        Results are cached (see the `memoize` decorator) in order to
        reduce network usage.
        """
        self._init_os_api()
        flavors = self.nova_client.flavors.list()
        return flavors

    def _load_instance(self, instance_id, force_reload=True):
        """
        Return instance with the given id.

        For performance reasons, the instance ID is first searched for in the
        collection of VM instances started by ElastiCluster
        (`self._instances`), then in the list of all instances known to the
        cloud provider at the time of the last update
        (`self._cached_instances`), and finally the cloud provider is directly
        queried.

        :param str instance_id: instance identifier
        :param bool force_reload:
          if ``True``, skip searching caches and reload instance from server
          and immediately reload instance data from cloud provider
        :return: py:class:`novaclient.v1_1.servers.Server` - instance
        :raises: `InstanceError` is returned if the instance can't
                 be found in the local cache or in the cloud.
        """
        self._init_os_api()
        if force_reload:
            try:
                # Remove from cache and get from server again
                vm = self.nova_client.servers.get(instance_id)
            except NotFound:
                raise InstanceNotFoundError(
                    "Instance `{instance_id}` not found"
                    .format(instance_id=instance_id))
            # update caches
            self._instances[instance_id] = vm
            self._cached_instances[instance_id] = vm

        # if instance is known, return it
        if instance_id in self._instances:
            return self._instances[instance_id]

        # else, check (cached) list from provider
        if instance_id not in self._cached_instances:
            # Refresh the cache, just in case
            self._cached_instances = dict(
                (vm.id, vm) for vm in self.nova_client.servers.list())

        if instance_id in self._cached_instances:
            inst = self._cached_instances[instance_id]
            self._instances[instance_id] = inst
            return inst

        # If we reached this point, the instance was not found neither
        # in the caches nor on the website.
        raise InstanceNotFoundError(
            "Instance `{instance_id}` not found"
            .format(instance_id=instance_id))


    def _allocate_address(self, instance, network_ids):
        """
        Allocates a floating/public ip address to the given instance,
        dispatching to either the Compute or Network API depending
        on installed packages.

        :param instance: instance to assign address to

        :param list network_id: List of IDs (as strings) of networks
          where to request allocation the floating IP.

        :return: public ip address
        """
        log.debug(
            "Trying to allocate floating IP for VM `%s` on network(s) %r",
            instance.id, network_ids)
        try:
            # on python-novaclient>=8.0.0 this fails with
            # `AttributeError` since the `Client.floating_ips`
            # attribute has been removed
            return self._allocate_address_nova(instance, network_ids)
        except AttributeError:
            return self._allocate_address_neutron(instance, network_ids)


    def _allocate_address_nova(self, instance, network_ids):
        """
        Allocates a floating/public ip address to the given instance,
        using the OpenStack Compute ('Nova') API.

        An already-allocated floating IP with no fixed IP attached is
        reused if there is one; otherwise a new floating IP is
        requested.  The whole operation runs while holding the
        class-wide node-start lock.

        :param instance: instance to assign address to

        :param list network_ids: List of IDs (as strings) of networks
          where to request allocation the floating IP.  **Ignored**
          (only used by the corresponding Neutron API function).

        :return: public ip address
        :raises: `RuntimeError` if no floating IP could be obtained;
          `AttributeError` if the installed novaclient has no
          ``floating_ips`` attribute (python-novaclient >= 8.0.0).
        """
        self._init_os_api()
        with OpenStackCloudProvider.__node_start_lock:
            # Use the `novaclient` API (works with python-novaclient <8.0.0)
            free_ips = [ip for ip in self.nova_client.floating_ips.list() if not ip.fixed_ip]
            if free_ips:
                ip = free_ips.pop()
            else:
                log.debug("Trying to allocate a new floating IP ...")
                ip = self.nova_client.floating_ips.create()
            if ip is None:
                raise RuntimeError(
                    "Could not allocate floating IP for VM {0}"
                    .format(instance.id))
            instance.add_floating_ip(ip)
        return ip.ip


    def _allocate_address_neutron(self, instance, network_ids):
        """
        Allocates a floating/public ip address to the given instance,
        using the OpenStack Network ('Neutron') API.

        Steps, all performed while holding the class-wide node-start
        lock:

        1. reuse an unattached floating IP on one of the given
           networks, or else request a new one, trying each network
           in turn until one request succeeds;
        2. wait until the instance reports at least one interface and
           pick the first one that has a port ID;
        3. bind the floating IP to that port and wait (up to about 300
           seconds) for the floating IP to leave the ``DOWN`` state.

        :param instance: instance to assign address to
        :param list network_ids:
          List of IDs (as strings) of networks where to
          request allocation the floating IP.

        :return: public ip address
        :raises: `RuntimeError` if no floating IP could be allocated
          on any of the networks, or if no interface of the instance
          has a port to bind the floating IP to.
        """
        self._init_os_api()
        with OpenStackCloudProvider.__node_start_lock:
            # Note: to return *all* addresses, all parameters to
            # `neutron_client.list_floatingips()` should be left out;
            # setting them to `None` (e.g., `fixed_ip_address=None`)
            # results in an empty list...
            free_ips = [
                ip for ip in
                self.neutron_client.list_floatingips().get('floatingips', [])
                if (ip['floating_network_id'] in network_ids
                    # keep only unallocated IP addrs
                    and ip['fixed_ip_address'] is None
                    and ip['port_id'] is None)
            ]
            floating_ip = None
            if free_ips:
                floating_ip = free_ips.pop()
                log.debug("Using existing floating IP %r", floating_ip)
            else:
                # FIXME: OpenStack Network API v2 requires that we specify
                # a network ID along with the request for a floating IP.
                # However, ElastiCluster configuration allows for multiple
                # networks to be connected to a VM, but does not give any
                # hint as to which one(s) should be used for such requests.
                # So we try them all, ignoring errors until one request
                # succeeds and hope that it's OK. One can imagine
                # scenarios where this is *not* correct, but: (1) these
                # scenarios are unlikely, and (2) the old novaclient code
                # above has not even had the concept of multiple networks
                # for floating IPs and no-one has complained in 5 years...
                failures = []
                for network_id in network_ids:
                    log.debug(
                        "Trying to allocate floating IP on network %s ...", network_id)
                    try:
                        floating_ip = self.neutron_client.create_floatingip({
                            'floatingip': {
                                'floating_network_id':network_id,
                            }}).get('floatingip')
                    except BadNeutronRequest as err:
                        # remember the error and move on to the next
                        # network; errors are only reported if *no*
                        # network yields a floating IP
                        failure = (
                            "Failed allocating floating IP on network {0}: {1}"
                            .format(network_id, err))
                        log.warning("%s", failure)
                        failures.append(failure)
                        continue
                    if floating_ip is not None:
                        log.debug(
                            "Allocated IP address %s on network %s",
                            floating_ip.get('floating_ip_address'), network_id)
                        break  # stop at first network where we get a floating IP
                if floating_ip is None and failures:
                    raise RuntimeError('; '.join(failures))
            # `floating_ip` is still `None` here if `network_ids` was empty
            if (floating_ip is None
                    or floating_ip.get('floating_ip_address', None) is None):
                raise RuntimeError(
                    "Could not allocate floating IP for VM {0}"
                    .format(instance.id))
            # wait until at least one interface is up
            interfaces = []
            # FIXME: no timeout!
            while not interfaces:
                interfaces = instance.interface_list()
                sleep(2)  ## FIXME: hard-coded value
            # get port ID
            for interface in interfaces:
                log.debug(
                    "Instance %s (ID: %s):"
                    " Checking if floating IP can be attached to interface %r ...",
                    instance.name, instance.id, interface)
                port_id = interface.port_id
                if port_id is None:
                    log.debug(
                        "Instance %s (ID: %s):"
                        " Skipping interface %r: no port ID!",
                        instance.name, instance.id, interface)
                    continue
                log.debug(
                    "Instance `%s` (ID: %s):"
                    " will assign floating IP to port ID %s (state: %s),"
                    " already running IP addresses %r",
                    instance.name, instance.id,
                    port_id, interface.port_state,
                    [item['ip_address'] for item in interface.fixed_ips])
                if interface.port_state != 'ACTIVE':
                    # not fatal: the binding below is attempted anyway
                    log.warning(
                        "Instance `%s` (ID: %s):"
                        " port `%s` is in state %s (epected 'ACTIVE' instead)",
                        instance.name, instance.id,
                        port_id, interface.port_state)
                break
            else:
                # loop ended without `break`: no interface had a port ID
                raise RuntimeError(
                    "Could not find port on network(s) {0}"
                    " for instance {1} (ID: {2}) to bind a floating IP to."
                    .format(network_ids, instance.name, instance.id))
            # assign floating IP to port
            floating_ip = self.neutron_client.update_floatingip(
                floating_ip['id'], {
                    'floatingip': {
                        'port_id': port_id,
                    },
                }
            ).get('floatingip')
            ip_address = floating_ip['floating_ip_address']
            log.debug("Assigned IP address %s to port %s", ip_address, port_id)

            log.info("Waiting 300s until floating IP %s is ACTIVE", ip_address)
            for i in range(300):
                _floating_ip = self.neutron_client.show_floatingip(floating_ip['id'])
                if _floating_ip['floatingip']['status'] != 'DOWN':
                    break
                sleep(1)

            # Invalidate cache for this VM, as we just assigned a new IP
            if instance.id in self._cached_instances:
                del self._cached_instances[instance.id]
        return ip_address


    # Fix pickler
    def __getstate__(self):
        return {'auth_url': self._os_auth_url,
                'username': self._os_username,
                'password': self._os_password,
                'project_name': self._os_tenant_name,
                'project_domain_name': self._os_project_domain_name,
                'user_domain_name': self._os_user_domain_name,
                'region_name': self._os_region_name,
                'request_floating_ip': self.request_floating_ip,
                'instance_ids': list(self._instances.keys()),
                'compute_api_version': self.compute_api_version,
            }

    def __setstate__(self, state):
        """
        Restore this provider from pickled *state*.

        Connection parameters are taken from *state*; API versions
        missing from it fall back to the module defaults.  Instance
        caches start out empty and the API clients are left unset.

        :param dict state: mapping as produced by `__getstate__`
        """
        self._os_auth_url = state['auth_url']
        self._os_username = state['username']
        self._os_password = state['password']
        self._os_tenant_name = state['project_name']
        self._os_user_domain_name = state['user_domain_name']
        self._os_project_domain_name = state['project_domain_name']
        self._os_region_name = state['region_name']
        self.request_floating_ip = state['request_floating_ip']
        # NOTE: no trailing commas on these assignments -- a trailing
        # comma would store a 1-tuple instead of the version value
        self._compute_api_version = state.get('compute_api_version', DEFAULT_COMPUTE_API_VERSION)
        self._identity_api_version = state.get('identity_api_version', DEFAULT_IDENTITY_API_VERSION)
        self._image_api_version = state.get('image_api_version', DEFAULT_IMAGE_API_VERSION)
        self._network_api_version = state.get('network_api_version', DEFAULT_NETWORK_API_VERSION)
        self._volume_api_version = state.get('volume_api_version', DEFAULT_VOLUME_API_VERSION)
        self._instances = {}
        self._cached_instances = {}
        # these will be initialized later by `_init_os_api()`
        self.nova_client = None
        self.neutron_client = None
        self.glance_client = None
        self.cinder_client = None