blob: fe078fa29e84f0e8d562ecaf0afa9a7962c116e6 [file] [log] [blame]
# coding: utf-8
# Copyright 2013 The LUCI Authors. All rights reserved.
# Use of this source code is governed under the Apache License, Version 2.0
# that can be found in the LICENSE file.
"""bot_config for swarming bots running in Home CI.
A copy of this file is downloaded to every swarming bot automatically.
Updates to this file are propogated to bots within ~10 minutes.
Prerequisites:
- bot has eurtest_libs installed via: cipd install home_internal/eurtest_libs
- eurtest_libs directory is in PYTHONPATH
There's 3 types of functions in this file:
- get_*() to return properties to describe this bot.
- on_*() as hooks based on events happening on the bot.
- setup_*() to setup global state on the host.
"""
import collections
import json
import logging
import os
import re
import sys
import time
from api import os_utilities
from api import platforms
# Hack to de-prioritize sys.path variables that bot_code imports which are
# incompatible with the libraries we're importing below.
# see https://crbug.com/758609
sys.path.sort(key=lambda x: 'swarm_env' not in x)
# each bot must be run in a virtualenv that has eurtest_libs in PYTHONPATH
try:
from eurtest.device import device as device_lib
from utilities import ssh_helper
except ImportError as e:
sys.stderr.write('Exception: %s\n' % e)
sys.stderr.write('sys.path:\n%s' % str(sys.path))
# Temporary while debugging issue
import google
print 'google.__path__: %s' % str(google.__path__)
raise
# Unused argument 'bot' - pylint: disable=W0613
DeviceBot = collections.namedtuple('DeviceBot', 'dimensions state')
def device(device_type, ip, groups=None, nic=None, serial=None, extra_state=None):
if not hasattr(ip, '__iter__'):
ip = [ip]
dimensions = {
u'device_ip': ip,
u'device_type': [device_type],
u'group': groups or ['COMMON'],
u'nic': [nic or 'ethernet'],
}
# You can't advertise a key that cannot be selected,
# so only set a dimension key if the value is not None
if serial:
dimensions[u'serial'] = [serial]
if len(ip) > 1:
dimensions[u'group'].append('multiconfig_box')
return DeviceBot(dimensions, extra_state)
_BOT_DEVICE_MAP = {
### home-demo pool ### - enforced via bot name prefix
'homedemo--1': device('assistantdefault', '1.2.3.4', ['abc_test']),
}
# global for caching device properties gathered dynamically during runtime
_DEVICE = None
### Helper Methods
class SSHError(Exception):
pass
class Device(object):
"""Retrieves device property information for a device over SSH."""
# amount of time (secs) until state is due for a refresh
_STALE_STATE_DELTA = 60
def __init__(self, device_ip, bot_id):
self.device_ip = device_ip
self.bot_id = bot_id
self._device_client = device_lib.Device(
ethernet_ip=self.device_ip)
self._device_client.UseSsh()
self._last_connect_succeeded = False
self._properties = {}
self._stale_state_ts = 0
@property
def build_fingerprint(self):
return self.properties.get('ro.build.fingerprint', None)
@property
def hardware(self):
return self.properties.get('ro.hardware', None)
@property
def is_reachable(self):
"""Checks if device is reachable over ssh."""
if self._properties is None or time.time() > self._stale_state_ts:
self._properties = self._fetch_properties()
return self._last_connect_succeeded
@property
def product(self):
return self.properties.get('ro.build.product', None)
@property
def properties(self):
"""Returns all of the last-known device properties as a dict."""
if self._properties is None or time.time() > self._stale_state_ts:
try:
# Keep last-known properties when SSHError is raised
self._properties = self._fetch_properties()
except SSHError as e:
logging.exception(e)
return self._properties
def _exec_ssh_cmd(self, cmd):
"""Executes the given cmd against the device over SSH.
Arguments:
- cmd: the command to be run on the device over ssh.
Returns:
stdout of cmd output
Raises:
SSHError: upon ssh cmd exec failure
"""
# Temporary: fetch device properties from static local file when present.
# This let's us demo what our swarm bots will look like, even though the
# demo host ('voyager') is on a lab network in WAT that can't reach the
# MTV devices over ssh. The real device properties are fetched over ssh
# by a script on Corp and copied to the swarm host before starting bots.
# TODO(jonesmi): remove this temporary workaround when bots run in MTV lab
bot_common_dir = os.getenv('BOT_COMMON_DIR')
if bot_common_dir:
props_file = os.path.join(bot_common_dir, '%s.props' % self.bot_id)
if os.path.isfile(props_file):
with open(props_file, 'r') as f:
# pretend that we read this from SSH connection
self._last_connect_succeeded = True
return f.read()
try:
if self._device_client.Cmd(cmd, timeout_secs=30):
self._last_connect_succeeded = True
return self._device_client.GetCmdOutput()
else:
self._last_connect_succeeded = False
raise SSHError('failed to execute cmd over ssh: %s' % cmd)
except ssh_helper.CommandThreadTimeoutError:
self._last_connect_succeeded = False
raise SSHError('timeout when waiting for ssh connect: %s' % cmd)
def _fetch_properties(self):
"""Fetches device properties over ssh using getprop command."""
self._stale_state_ts = time.time() + self._STALE_STATE_DELTA
properties = {}
stdout = self._exec_ssh_cmd('getprop')
for line in stdout.strip().split('\n'):
match = re.match(r'\[(.*)\]: \[(.*)\]', line)
if match:
properties[match.group(1)] = match.group(2)
return properties
### _get* Hooks
def get_dimensions(bot):
# pylint: disable=line-too-long
"""Returns dict with the bot's dimensions.
The dimensions are what are used to select the bot that can run each task.
The bot id will be automatically selected based on the hostname with
os_utilities.get_dimensions(). If you want something more special, specify it
in your bot_config.py and override the item 'id'.
The dimensions returned here will be joined with server defined dimensions
(extracted from bots.cfg config file based on the bot id). Server defined
dimensions override the ones provided by the bot. See bot.Bot.dimensions for
more information.
See https://github.com/luci/luci-py/tree/master/appengine/swarming/doc/Magic-Values.md.
Arguments:
- bot: bot.Bot instance or None. See ../api/bot.py.
"""
dimensions = os_utilities.get_dimensions()
# The bot base directory is formatted like <HOME>/bots/<Id>
id = '%s--%s' % (
os_utilities.get_hostname_short(),
os.path.basename(bot.base_dir))
dimensions[u'id'] = [id]
if id in _BOT_DEVICE_MAP:
dimensions.update(_BOT_DEVICE_MAP[id].dimensions)
global _DEVICE
if not _DEVICE:
device_ip = dimensions[u'device_ip']
# use first device_ip in list of device IPs for pulling device attributes
if device_ip:
assert isinstance(device_ip, (list, tuple)), repr(device_ip)
_DEVICE = Device(device_ip[0], id)
if _DEVICE:
dimensions[u'build'] = _DEVICE.build_fingerprint,
dimensions[u'hardware'] = _DEVICE.hardware,
dimensions[u'product'] = _DEVICE.product,
# TODO(jonesmi): don't strip these anymore after swarming fix goes in for limit on # dimensions
del dimensions[u'cores']
del dimensions[u'cpu']
del dimensions[u'gpu']
del dimensions[u'machine_type']
return dimensions
def get_state(bot):
# pylint: disable=line-too-long
"""Returns dict with a state of the bot reported to the server with each poll.
It is only for dynamic state that changes while bot is running for information
for the sysadmins.
The server can not use this state for immediate scheduling purposes (use
'dimensions' for that), but it can use it for maintenance and bookkeeping
tasks.
See https://github.com/luci/luci-py/tree/master/appengine/swarming/doc/Magic-Values.md.
Arguments:
- bot: bot.Bot instance or None. See ../api/bot.py.
"""
state = os_utilities.get_state()
if _DEVICE:
state[u'device'] = _DEVICE.properties
if _DEVICE.bot_id in _BOT_DEVICE_MAP:
extra_state = _BOT_DEVICE_MAP[_DEVICE.bot_id].state
if extra_state:
state[u'device'].update(extra_state)
if not _DEVICE.is_reachable:
state[u'quarantined'] = 'device is not reachable'
return state
def get_authentication_headers(bot):
"""Returns authentication headers and their expiration time.
The returned headers will be passed with each HTTP request to the Swarming
server (and only Swarming server). The bot will use the returned headers until
they are close to expiration (usually 6 min, see AUTH_HEADERS_EXPIRATION_SEC
in remote_client.py), and then it'll attempt to refresh them by calling
get_authentication_headers again.
Can be used to implement per-bot authentication. If no headers are returned,
the server will use only IP whitelist for bot authentication.
On GCE will use OAuth token of the default GCE service account. It should have
"User info" API scope enabled (this can be set when starting an instance). The
server should be configured (via bots.cfg) to trust this account (see
'require_service_account' in bots.proto).
May be called by different threads, but never concurrently.
Arguments:
- bot: bot.Bot instance. See ../api/bot.py.
Returns:
Tuple (dict with headers or None, unix timestamp of when they expire).
"""
if platforms.is_gce():
tok = platforms.gce.oauth2_access_token()
return {'Authorization': 'Bearer %s' % tok}, time.time() + 5*60
return (None, None)
### on_* Hooks
def on_bot_shutdown(bot):
"""Hook function called when the bot shuts down, usually rebooting.
It's a good time to do other kinds of cleanup.
Arguments:
- bot: bot.Bot instance. See ../api/bot.py.
"""
pass
def on_handshake(bot):
"""Hook function called when the bot starts, after handshake with the server.
Here the bot already knows server enforced dimensions (defined in server side
bots.cfg file).
This is called right before starting to poll for tasks. It's a good time to
do some final initialization or cleanup that may depend on server provided
configuration.
Arguments:
- bot: bot.Bot instance. See ../api/bot.py.
"""
pass
def on_before_task(bot, bot_file):
"""Hook function called before running a task.
It shouldn't do much, since it can't cancel the task so it shouldn't do
anything too fancy.
Arguments:
- bot: bot.Bot instance. See ../api/bot.py.
- bot_file: Path to file to write information about the state of the bot.
This file can be used to pass certain info about the bot
to tasks, such as which connected android devices to run on. See
https://github.com/luci/luci-py/tree/master/appengine/swarming/doc/Magic-Values.md#run_isolated
"""
with open(bot_file, 'wb') as f:
json.dump(bot.dimensions, f)
def on_after_task(bot, failure, internal_failure, dimensions, summary):
"""Hook function called after running a task.
It is an excellent place to do post-task cleanup of temporary files.
The default implementation restarts after a task failure or an internal
failure.
Arguments:
- bot: bot.Bot instance. See ../api/bot.py.
- failure: bool, True if the task failed.
- internal_failure: bool, True if an internal failure happened.
- dimensions: dict, Dimensions requested as part of the task.
- summary: dict, Summary of the task execution.
"""
# Example code:
#if failure:
# bot.restart('Task failure')
#elif internal_failure:
# bot.restart('Internal failure')
def on_bot_idle(bot, since_last_action):
"""Hook function called once when the bot has been idle; when it has no
command to execute.
This is an excellent place to put device in 'cool down' mode or any
"pre-warming" kind of stuff that could take several seconds to do, that would
not be appropriate to do in on_after_task(). It could be worth waiting for
`since_last_action` to be several seconds before doing a more lengthy
operation.
This function is called repeatedly until an action is taken (a task, updating,
etc).
This is a good place to do "auto reboot" for hardware based bots that are
rebooted periodically.
Arguments:
- bot: bot.Bot instance. See ../api/bot.py.
- since_last_action: time in second since last action; e.g. amount of time the
bot has been idle.
"""
pass