Source code for arc.job.local

"""
A module for running jobs on the local machine.
When transitioning to Python 3, use
`subprocess.run() <https://docs.python.org/3/library/subprocess.html#subprocess.run>`_
"""

import datetime
import os
import re
import shutil
import subprocess
import time
from typing import List, Optional, Tuple, Union

from arc.common import get_logger
from arc.exceptions import SettingsError
from arc.imports import settings
from arc.job.ssh import check_job_status_in_stdout


logger = get_logger()

servers, check_status_command, submit_command, submit_filenames, delete_command, output_filenames = \
    settings['servers'], settings['check_status_command'], settings['submit_command'], settings['submit_filenames'],\
    settings['delete_command'], settings['output_filenames']



[docs]
def execute_command(command: Union[str, List[str]],
                    shell: bool = True,
                    no_fail: bool = False,
                    executable: Optional[str] = None,
                    ) -> Tuple[Optional[list], Optional[list]]:
    """
    Execute a command.

    Notes:
        If ``no_fail`` is ``True``, then a warning is logged and ``False`` is returned
        so that the calling function can debug the situation.

    Args:
        command (Union[str, List[str]]): An array of string commands to send.
        shell (bool, optional): Specifies whether the command should be executed using bash instead of Python.
        no_fail (bool, optional): If ``True`` then ARC will not crash if an error is encountered.
        executable (str, optional): Select a specific shell to run with, e.g., '/bin/bash'.
                                    Default shell of the subprocess command is '/bin/sh'.

    Returns: Tuple[list, list]:
        - A list of lines of standard output stream.
        - A list of lines of the standard error stream.
    """
    error = None
    if not isinstance(command, list):
        command = [command]
    command = [' && '.join(command)]
    i, max_times_to_try = 1, 30
    sleep_time = 60  # Seconds
    while i < max_times_to_try:
        try:
            if executable is None:
                completed_process = subprocess.run(command, shell=shell, capture_output=True)
            else:
                completed_process = subprocess.run(command, shell=shell, capture_output=True, executable=executable)
            return _format_stdout(completed_process.stdout), _format_stdout(completed_process.stderr)
        except subprocess.CalledProcessError as e:
            error = e  # Store the error so we can raise a SettingsError if needed.
            if no_fail:
                _output_command_error_message(command, e, logger.warning)
                return None, None
            else:
                _output_command_error_message(command, e, logger.error)
                logger.error(f'ARC is sleeping for {sleep_time * i} seconds before retrying.\nPlease check whether '
                             f'this is a server issue by executing the command manually on the server.')
                logger.info('ZZZZZ..... ZZZZZ.....')
                time.sleep(sleep_time * i)  # In seconds
                i += 1
    # If unsuccessful:
    raise SettingsError(f'The command "{command}" is erroneous, got: \n{error}'
                        f'\nThis maybe either a server issue or the command is wrong.'
                        f'\nTo check if this is a server issue, please run the command on server and restart ARC.'
                        f'\nTo correct the command, modify settings.py'
                        f'\nTips: use "which" command to locate cluster software commands on server.'
                        f'\nExample: type "which sbatch" on a server running Slurm to find the correct '
                        f'sbatch path required in the submit_command dictionary.')



def _output_command_error_message(command: List[str],
                                  error: subprocess.CalledProcessError,
                                  logging_func: Union[logger.warning, logger.error],
                                  ) -> None:
    """
    Formats and logs the error message returned from a command at the desired logging level

    Args:
        command (List[str]): The command that threw the error.
        error (subprocess.CalledProcessError): The exception caught by python from subprocess.
        logging_func: ``logging.warning`` or ``logging.error`` as a function object.
    """
    logging_func('The server command is erroneous.')
    logging_func(f'Tried to submit the following command:\n{command}')
    logging_func('And got the following status (cmd, message, output, return code)')
    logging_func(error.cmd)
    logger.info('\n')
    logging_func(error)
    logger.info('\n')
    logging_func(error.output)
    logger.info('\n')
    logging_func(error.returncode)


def _format_stdout(stdout: bytes) -> List[str]:
    """
    Format the stdout as a list of unicode strings

    Args:
        stdout (bytes): The standard output.

    Returns:
        List(str): The decoded lines from stdout.
    """
    lines, list_of_strs = stdout.splitlines(), list()
    for line in lines:
        list_of_strs.append(line.decode())
    return list_of_strs



[docs]
def check_job_status(job_id: int) -> str:
    """
    Possible status values: ``before_submission``, ``running``, ``errored on node xx``, ``done``
    Status line formats:

    OGE::

        540420 0.45326 xq1340b    user_name       r     10/26/2018 11:08:30 long1@node18.cluster

    Slurm::

        14428     debug xq1371m2   user_name  R 50-04:04:46      1 node06

    PBS (taken from zeldo.dow.com)::
                                                                                         Req'd       Req'd       Elap
        Job ID                  Username    Queue    Jobname         SessID  NDS   TSK   Memory      Time    S   Time
        ----------------------- ----------- -------- --------------- ------ ----- ------ --------- --------- - ---------
        2016614.zeldo.local     u780444     workq    scan.pbs         75380     1     10       --  730:00:00 R  00:00:20
        2016616.zeldo.local     u780444     workq    scan.pbs         75380     1     10       --  730:00:00 R  00:00:20

    HTCondor (using ARC's modified condor_q command)::

        3261.0 R 10 28161 a2719 56
        3263.0 R 10 28161 a2721 23
        3268.0 R 10 28161 a2726 18
        3269.0 R 10 28161 a2727 17
        3270.0 P 10 28161 a2728 23
    """
    server = 'local'
    cmd = check_status_command[servers[server]['cluster_soft']]
    stdout = execute_command(cmd)[0]
    return check_job_status_in_stdout(job_id=job_id, stdout=stdout, server=server)




[docs]
def delete_job(job_id: Union[int, str]):
    """
    Deletes a running job.
    """
    cmd = f"{delete_command[servers['local']['cluster_soft']]} {job_id}"
    success = not bool(execute_command(cmd, no_fail=True)[1])
    if not success:
        logger.warning(f'Detected possible error when trying to delete job {job_id}. Checking to see if the job is '
                       f'still running...')
        running_jobs = check_running_jobs_ids()
        if job_id in running_jobs:
            logger.error(f'Job {job_id} was scheduled for deletion, but the deletion command has appeared to errored. '
                         f'The job is still running.')
            raise RuntimeError(f'Could not delete job {job_id}')
        else:
            logger.info(f'Job {job_id} is no longer running.')




[docs]
def check_running_jobs_ids() -> List[str]:
    """
    Check which jobs are still running on the server for this user.

    Returns:
        List[str]: List of job IDs.
    """
    cluster_soft = servers['local']['cluster_soft'].lower()
    if cluster_soft not in ['slurm', 'oge', 'sge', 'pbs', 'htcondor']:
        raise ValueError(f"Server cluster software {servers['local']['cluster_soft']} is not supported.")
    cmd = check_status_command[servers['local']['cluster_soft']]
    stdout = execute_command(cmd)[0]
    running_job_ids = parse_running_jobs_ids(stdout, cluster_soft=cluster_soft)
    return running_job_ids




[docs]
def parse_running_jobs_ids(stdout: List[str],
                           cluster_soft: Optional[str] = None,
                           ) -> List[str]:
    """
    A helper function for parsing job IDs from the stdout of a job status command.

    Args:
        stdout (List[str]): The stdout of a job status command.
        cluster_soft (Optional[str]): The cluster software.

    Returns:
        List(str): List of job IDs.
    """
    cluster_soft = cluster_soft or servers['local']['cluster_soft'].lower()
    i_dict = {'slurm': 0, 'oge': 1, 'sge': 1, 'pbs': 4, 'htcondor': -1}
    split_by_dict = {'slurm': ' ', 'oge': ' ', 'sge': ' ', 'pbs': '.', 'htcondor': '.'}
    running_job_ids = list()
    for i, status_line in enumerate(stdout):
        if i > i_dict[cluster_soft]:
            job_id = status_line.strip().split(split_by_dict[cluster_soft])[0]
            job_id = f'{job_id}'  # job_id is sometimes a byte, this transforms b'bytes' into "b'bytes'"
            if "b'" in job_id:
                job_id = job_id.split("b'")[1].split("'")[0]
            running_job_ids.append(job_id)
    return running_job_ids




[docs]
def submit_job(path: str,
               cluster_soft: Optional[str] = None,
               submit_cmd: Optional[str] = None,
               submit_filename: Optional[str] = None,
               recursion: bool = False,
               ) -> Tuple[Optional[str], Optional[str]]:
    """
    Submit a job.

    Args:
        path (str): The job's folder path, where the submit script is located (just the folder path, w/o the filename).
        cluster_soft (str, optional): The server cluster software.
        submit_cmd (str, optional): The submit command.
        submit_filename (str, optional): The submit script file name.
        recursion (bool, optional): Whether this call is within a recursion.

    Returns:
        Tuple[Optional[str], Optional[str]]: job_status, job_id
    """
    cluster_soft = cluster_soft or servers['local']['cluster_soft']
    job_status, job_id = '', ''
    submit_cmd = submit_cmd or submit_command[cluster_soft]
    submit_filename = submit_filename or submit_filenames[cluster_soft]
    cmd = f'cd "{path}"; {submit_cmd} {submit_filename}'
    stdout, stderr = execute_command(cmd)
    if not len(stdout):
        time.sleep(10)
        stdout, stderr = execute_command(cmd)
    if stderr:
        if cluster_soft.lower() == 'slurm' and any('AssocMaxSubmitJobLimit' in err_line for err_line in stderr):
            logger.warning(f'Max number of submitted jobs was reached, sleeping...')
            time.sleep(5 * 60)
            submit_job(path=path,
                       cluster_soft=cluster_soft,
                       submit_cmd=submit_cmd,
                       submit_filename=submit_filename,
                       recursion=True,
                       )
        if cluster_soft.lower() == 'pbs' and  (any('qsub: would exceed' in err_line for err_line in stderr ) or any('qsub: Maximum number of jobs' in err_line for err_line in stderr)):
            logger.warning(f'Max number of submitted jobs was reached, sleeping...')
            time.sleep(5 * 60)
            submit_job(path=path,
                       cluster_soft=cluster_soft,
                       submit_cmd=submit_cmd,
                       submit_filename=submit_filename,
                       recursion=True,
                       )
        elif cluster_soft.lower() == 'pbs' and any('qsub: Illegal attribute or resource value' in err_line for err_line in stderr):
            raise ValueError(f'Got the following error when trying to submit job:\n{stderr}. Please check your submit script')
    if not len(stdout) or recursion:
        return None, None
    if len(stderr) > 0 or len(stdout) == 0:
        logger.warning(f'Got the following error when trying to submit job:\n{stderr}.')
        job_status = 'errored'
    else:
        job_id = _determine_job_id(stdout=stdout, cluster_soft=cluster_soft)
    job_status = 'running' if job_id else job_status
    return job_status, job_id



def _determine_job_id(stdout: List[str],
                      cluster_soft: Optional[str] = None
                      ) -> str:
    """
    Determine the job ID right after it was submitted from the stdout.

    Args:
        stdout (List[str]): The stdout got from submitting a job.
        cluster_soft (str, optional): The server cluster software.

    Returns:
        str: The determined job ID.
    """
    job_id = ''
    cluster_soft = cluster_soft or servers['local']['cluster_soft']
    cluster_soft = cluster_soft.lower() if cluster_soft is not None else None
    if cluster_soft in ['oge', 'sge'] and 'submitted' in stdout[0].lower():
        job_id = stdout[0].split()[2]
    elif cluster_soft == 'slurm' and 'submitted' in stdout[0].lower():
        job_id = stdout[0].split()[3]
    elif cluster_soft == 'pbs':
        job_id = stdout[0].split('.')[0]
    elif cluster_soft == 'htcondor' and 'submitting' in stdout[0].lower():
        if len(stdout) and len(stdout[1].split()) and len(stdout[1].split()[-1].split('.')):
            job_id = stdout[1].split()[-1].split('.')[0]
    else:
        raise ValueError(f'Unrecognized cluster software: {cluster_soft}')
    return job_id



[docs]
def get_last_modified_time(file_path_1: str,
                           file_path_2: Optional[str] = None,
                           ) -> Optional[datetime.datetime]:
    """
    Returns the last modified time of ``file_path_1`` if the file exists,
    else returns the last modified time of ``file_path_2`` if the file exists.

    Args:
        file_path_1 (str): The path to file 1.
        file_path_2 (str, optional): The path to file 2.
    """
    timestamp = None
    if os.path.isfile(file_path_1):
        try:
            timestamp = os.stat(file_path_1).st_mtime
        except (IOError, OSError):
            pass
    if timestamp is None and file_path_2 is not None:
        try:
            timestamp = os.stat(file_path_2).st_mtime
        except (IOError, OSError):
            return None
    return datetime.datetime.fromtimestamp(timestamp) if timestamp is not None else None




[docs]
def write_file(file_path: str, file_string: str) -> None:
    """
    Write ``file_string`` as the file's content in ``file_path``.

    Args:
        file_path (str): The file path.
        file_string (str): The content to be written into the file.
    """
    with open(file_path, 'w') as f:
        f.write(file_string)




[docs]
def rename_output(local_file_path: str,
                  software: str,
                  ) -> None:
    """
    Rename the output file to "output.out" for consistency between software.

    Args:
        local_file_path (str): The full path to the output.out file.
        software (str): The software used for the job by which the original output file name was determined.
    """
    software = software.lower()

    for i in range(5):
        if not os.path.isfile(local_file_path) \
                and not os.path.isfile(os.path.join(os.path.dirname(local_file_path), output_filenames[software])):
            # Wait for file to be transferred on the server (the head node might be busy).
            time.sleep(6)
        else:
            break
    else:
        # Nothing to rename.
        return None

    if os.path.isfile(os.path.join(os.path.dirname(local_file_path), output_filenames[software])):
        shutil.move(src=os.path.join(os.path.dirname(local_file_path), output_filenames[software]), dst=local_file_path)




[docs]
def change_mode(mode: str,
                file_name: str,
                recursive: bool = False,
                path: str = '',
                ) -> None:
    """
    Change the mode of a file or a directory.

    Args:
        mode (str): The mode change to be applied, can be either octal or symbolic.
        file_name (str): The path to the file or the directory to be changed.
        recursive (bool, optional): Whether to recursively change the mode to all files
                                    under a directory. ``True`` for recursively change.
        path (str, optional): The directory path at which the command will be executed.
    """
    if os.path.isfile(path):
        path = os.path.dirname(path)
    recursive = ' -R' if recursive else ''
    command = [f'cd "{path}"'] if path else []
    command.append(f'chmod{recursive} {mode} {file_name}')
    execute_command(command=command)




[docs]
def delete_all_local_arc_jobs(jobs: Optional[List[Union[str, int]]] = None) -> None:
    """
    Delete all ARC-spawned jobs (with job name starting with `a` and a digit) from the local server.
    Make sure you know what you're doing, so unrelated jobs won't be deleted...
    Useful when terminating ARC while some (ghost) jobs are still running.

    Args:
        jobs (List[Union[str, int]], optional): Specific ARC job IDs to delete.
    """
    server = 'local'
    if server in servers:
        print('\nDeleting all ARC jobs from local server...')
        cmd = check_status_command[servers[server]['cluster_soft']]
        stdout = execute_command(cmd, no_fail=True)[0]
        for status_line in stdout:
            s = re.search(r' a\d+', status_line)
            if s is not None:
                job_name = s.group()[1:]
                cluster_soft = servers[server]['cluster_soft'].lower()
                server_job_id = None
                if jobs is None or job_name in jobs:
                    if cluster_soft == 'slurm':
                        server_job_id = status_line.split()[0]
                        delete_job(server_job_id)
                    elif cluster_soft == 'pbs':
                        server_job_id = status_line.split()[0]
                        delete_job(server_job_id)
                    elif cluster_soft in ['oge', 'sge']:
                        delete_job(job_name)
                    elif cluster_soft == 'htcondor':
                        server_job_id = status_line.split()[0].split('.')[0]
                        delete_job(server_job_id)
                    else:
                        raise ValueError(f'Unrecognized cluster software {cluster_soft}.')
                    aux_text = f' ({server_job_id} on server)' if server_job_id is not None else ''
                    print(f'deleted job {job_name}{aux_text}.')
        print('\ndone.')