import argparse
import filecmp
import glob
import logging
import os
import re
import subprocess as sp
import xml.etree.ElementTree as ET
from collections import defaultdict
from datetime import datetime, timedelta
from pprint import pformat
from typing import Any, Dict, List, Set, Tuple

import requests

from .utils import (
    LogHistoryHandler,
    can_be_affected,
    handle_processing_error,
    jenkins_job_is_running,
    make_gitlab_mr_branch_name_and_title,
    mrs_have_changed,
    rq_get_to_json,
    rq_post_to_json,
    slot_merges,
)

# FIXME List
# 1. Scenario: new test, but no ref file, or only one of (avx, default), what happens?
#    Does expandvars work if no file is on disk?
# 2. notrun tests are currently not really properly accounted for
#    we don't AFAIK have any with refs, so they just get filtered out anyway
#    But we should double check how to approach platform dependent skips of tests
#    Also we now have proper dependency management of tests, so if A fails B and C are skipped if they depend on A
#    Here we would warn about A though, so that could be enough
# 3. Ensure SSL certs are installed when using this bot
# 4. Implement creation of MR and we probably want to somehow mention in the MR description some info on what
#    warnings were encountered
# 5. what if there is a new test? ref from v4 platform won't have v4 suffix -> need to check for diff and add suffix
# 6. we don't really fully check shared refs across tests. e.g. MiniBrunel Hive which uses the ref from single
#    threaded test
#    in create_ref...() there is a check that if a sym linked ref changes, the original changes too, but
#    more detailed checks or checking the other direction is missing.

# FIXME
# gauss online disabled
# gauss merges show up in branch name if it isn't disabled

# Names of slot projects that are excluded when collecting the active projects
# of a slot and their base commits.
IGNORED_PROJECTS = [
    "LCG",
    "PARAM",
    "DBASE",
    "Gaudi",
    "Online",
    "LHCbIntegrationTests",
    "Panoramix",
    "GaussinoExtLibs",
]
# FIXME If Gaudi tests are failing, should we warn about maybe not being sure if
# they could cause our diffs?
log = logging.getLogger(__name__)

# see decorate_logger()
LOG_HISTORY: Dict[str, List[str]] = defaultdict(list)

# loose type check, but better than nothing
# see https://github.com/python/typing/issues/182
JsonDict = Dict[str, Any]

COUCH_DB_SERVER = "https://lhcb-couchdb.cern.ch/nightlies-nightly"
MRS_VIEW_URL = COUCH_DB_SERVER + "/_design/merge_requests/_view/mrs"
NIGHTLIES_XML_URL = "https://lhcb-nightlies.web.cern.ch/api/v1/nightly/"

CLANG_VER = "clang19"
GCC_VER = "gcc15"
OS_VER = "el9"
ARM_VER = "armv8.1_a"

ALL_PLATFORMS = [
    "x86_64_v3-" + OS_VER + "-" + GCC_VER + "-opt+g",
    "x86_64_v3-" + OS_VER + "-" + GCC_VER + "-dbg",
    "x86_64_v3-" + OS_VER + "-" + GCC_VER + "+detdesc-opt",
    "x86_64_v3-" + OS_VER + "-" + GCC_VER + "+detdesc-dbg",
    "x86_64_v3-" + OS_VER + "-" + CLANG_VER + "-opt",
    ARM_VER + "-" + OS_VER + "-" + GCC_VER + "-opt",
    # When we are ready, add x86_64_v4 builds
    # "x86_64_v4-" + OS_VER + "-" + GCC_VER + "-opt+g",
    # Until we can run tests for a cuda build, ignore it entirely...
    # 'x86_64_v3-"+OS_VER+"-gcc12+cuda12_1-opt+g',
]
CLANG_PLATFORMS = [p for p in ALL_PLATFORMS if CLANG_VER in p]

# define per platform which projects are allowed to
# exhibit build failures.
# FIXME maybe also make this a CLI option?
BUILD_EXCEPTIONS = {
    "AlignmentOnline": ALL_PLATFORMS,
    "Panoramix": CLANG_PLATFORMS,
    "Gaussino": CLANG_PLATFORMS,
    "Gauss": CLANG_PLATFORMS,
}

# Regular expressions (used with re.match) for reference files whose tests are
# known to be unstable: mixed passed/failed results only produce a warning.
UNSTABLE_TEST_REFS = {
    # https://gitlab.cern.ch/lhcb/Alignment/-/issues/82
    r".*/align-run3-collisions\.ref$",
    # https://gitlab.cern.ch/lhcb/Moore/-/issues/607
    # https://gitlab.cern.ch/lhcb/Moore/-/issues/769
    r".*/allen_gaudi_forward_with_mcchecking\.ref",
    r".*/allen_gaudi_pv_with_mcchecking\.ref",
    r".*/allen_gaudi_seed_and_match_with_mcchecking\.ref",
    r".*/allen_gaudi_velo_with_mcchecking\.ref",
    r".*/hlt1_hlt2_comparison\.ref",
    r".*/hlt1_hlt2_pvs_vertex_compare\.ref",
    # https://gitlab.cern.ch/lhcb/Moore/-/issues/780
    r".*/hlt2_protoparticles_ttrack_fastest\.ref",
}


def skip_ref_update(plat: str, ref_name: str) -> bool:
    """Determine if a ref update should be skipped based on platform and reference file name.

    A ref is skipped if its name carries one of the opt-out tags, or if the
    platform is a "special" one (detdesc, ARM, dbg, clang) and the ref name
    does not contain the matching marker.
    """
    skip = (
        "empty.ref" in ref_name
        or "REF-BOT-SKIP-UPDATE" in ref_name
        or "SYMLINK-DO_NOT_UPDATE_WITH_NEW" in ref_name
        or ("detdesc" in plat and "REF-BOT-SKIP-DETDESC-UPDATE" in ref_name)
        or ("detdesc" not in plat and "REF-BOT-SKIP-DD4HEP-UPDATE" in ref_name)
        or ("detdesc" in plat and "detdesc" not in ref_name)
        or (ARM_VER in plat and ARM_VER not in ref_name)
        or ("dbg" in plat and "dbg" not in ref_name)
        or (CLANG_VER in plat and CLANG_VER not in ref_name)
    )
    if skip:
        log.debug(f"SKIPPING UPDATE for ref: {ref_name} platform: {plat}")
    else:
        log.debug(f"ALLOWING UPDATE for ref: {ref_name} platform: {plat}")
    return skip


def get_active_projects_in_slot(slot_json: JsonDict) -> JsonDict:
    """create dictionary containing all active projects in slot

    key is project name
    value is dictionary with some basic info that we need later
    """
    # "no_test" is set in couchdb if a project doesn't have any tests
    # "checkout_opts" gives us the checkout commit plus a list of MRs that were merged during checkout
    # We filter out projects that weren't build (disabled flag), LCG, and some datapackages
    return {
        p["name"]: {**p["checkout_opts"], "no_tests": p.get("no_test", False)}
        for p in slot_json["config"]["projects"]
        if p["name"] not in IGNORED_PROJECTS and not p["disabled"]
    }


def decorate_logger(
    logger: logging.Logger,
    history: Dict[str, List[str]],
    slot_json: JsonDict,
) -> None:
    """Adds a LogHistoryHandler to the logger

    history is the dict messages get stored to
    Messages are stored separately for each active project specified in slot_json
    """
    if logger.handlers:
        logger.warning(f"Logger {logger} already has extra handlers:\n{logger.handlers}")

    project_names = list(get_active_projects_in_slot(slot_json).keys())
    logger.addHandler(LogHistoryHandler(history, project_names))


def get_projects_base_commit(slot_json: JsonDict) -> Dict[str, str]:
    """return Dict{project name : commit sha} for projects that are not in IGNORED_PROJECTS

    these commits represent the current state of reference branches without the
    MRs from the *-mr slot applied and are our base commit on top of which the
    references update commit is made

    Projects without a "commit" entry in their checkout options are left out
    of the returned dict (a warning is logged for each of them).
    """
    retV = {}
    for p in slot_json["config"]["projects"]:
        n = p["name"]
        if n not in IGNORED_PROJECTS and not p["disabled"]:
            if "commit" in p["checkout_opts"]:
                c = p["checkout_opts"]["commit"]
                retV[n] = c
                log.debug(f"Using commit {c} for project {n}")
            else:
                log.warning(f"Project {n} has no commit, probably using a tag. Will skip updates !!")
    return retV


def create_ref_update_commit(
    ref_sha: str,
    mr_slot_name: str,
    mr_slot_id: int,
    proj: str,
    target_branch: str,
    plats: Set[str],
    refs_to_update: List[str],
    new_refs: List[str],
    branch_name: str,
    commit_msg: str,
) -> str:
    """Download references, checkout project, and create new branch with commit containing updated refs

    Returns the project name if a commit was created on ``branch_name``, and
    an empty string if there was nothing to update or a git operation failed.
    """
    import shlex  # local import: only needed to quote arguments of the shell commands below

    log.debug(f"{proj}: refs to update:{pformat(refs_to_update)}")
    log.debug(f"{proj}: new refs:{pformat(new_refs)}")
    log.debug(f"{proj}: downloading platforms:{pformat(plats)}")

    if os.getenv("lhcbsoft_gitlab_token"):
        log.debug(f"{proj}: Using lhcbsoft gitlab token")
        # "$lhcbsoft_gitlab_token" is left for the shell to expand
        gitlab_base_url = "https://ref_bot:$lhcbsoft_gitlab_token@gitlab.cern.ch/lhcb"
    else:
        gitlab_base_url = "ssh://git@gitlab.cern.ch:7999/lhcb"

    sp.check_call(
        f"( test -d {proj} || git clone {gitlab_base_url}/{proj}.git --depth 1 ) && "
        f"git -C {proj} fetch --depth 1 --no-tags https://gitlab.cern.ch/lhcb-nightlies/{proj}.git {ref_sha} && "
        f"git -C {proj} checkout -B tmp {ref_sha}",
        shell=True,
    )

    # "{{plat}}" survives the f-string as "{plat}" and is filled in per platform below
    get_refs = f"""
        cd {proj}
        if curl -L -q -o {proj}.zip https://s3.cern.ch/lhcb-nightlies-artifacts/nightly/{mr_slot_name}/{mr_slot_id}/tests/{{plat}}/newrefs/{proj}.zip ; then
            unzip -o -qq {proj}.zip
            rm -rf {proj}.zip
        fi
    """

    loaded_refs = set()
    linked_refs = set()
    skipped_refs = set()
    created_refs = set()

    v4_ext = ".x86_64_v4-opt"
    detdesc_ext = ".detdesc"
    detdesc_v4_ext = ".x86_64_v4-detdesc-opt"
    arm_ext = "." + ARM_VER
    dbg_ext = ".dbg"
    detdesc_dbg_ext = ".detdesc.dbg"

    for plat in plats:
        sp.check_call(get_refs.format(plat=plat), shell=True)
        for new_ref_path in glob.glob(f"{proj}/**/*.new", recursive=True):
            ref_path = new_ref_path[:-4]  # remove the ".new" suffix
            ref_rel_path = os.path.relpath(ref_path, proj)
            if os.path.islink(ref_path):
                linked_refs.add(ref_rel_path)
                continue  # never update symlinks
            if skip_ref_update(plat, ref_rel_path):
                skipped_refs.add(ref_rel_path)
                continue  # Skip ref update based on tag in file name
            if ref_rel_path in new_refs:
                if "_v4-" in plat:
                    if "detdesc" in plat:
                        ref_path += detdesc_v4_ext
                        ref_rel_path += detdesc_v4_ext
                    else:
                        ref_path += v4_ext
                        ref_rel_path += v4_ext
                else:
                    if "detdesc" in plat:
                        if "dbg" in plat:
                            ref_path += detdesc_dbg_ext
                            ref_rel_path += detdesc_dbg_ext
                        else:
                            ref_path += detdesc_ext
                            ref_rel_path += detdesc_ext
                    else:
                        if "dbg" in plat:
                            ref_path += dbg_ext
                            ref_rel_path += dbg_ext
                        if ARM_VER in plat:
                            ref_path += arm_ext
                            ref_rel_path += arm_ext
                created_refs.add(ref_rel_path)
            else:
                loaded_refs.add(ref_rel_path)
            os.rename(new_ref_path, ref_path)

    # Drop platform specific variants (v4, ARM) of a new ref that are identical
    # to the base ref.
    for ref_path in list(created_refs):
        if "_v4-" in ref_path or ARM_VER in ref_path or "dbg" in ref_path:
            continue
        full_path_v3 = proj + "/" + ref_path
        # NOTE: the suffix is derived from the ref itself. It used to be taken
        # from the loop variable `plat` left over from iterating the (unordered)
        # set `plats`, which made the choice arbitrary.
        ref_path_v4 = ref_path + (detdesc_v4_ext if "detdesc" in ref_path else v4_ext)
        for variant in (ref_path_v4, ref_path + arm_ext):
            # only variants that were actually created in this run can be
            # compared; anything else has no file / set entry to remove
            if variant not in created_refs:
                continue
            full_path_variant = proj + "/" + variant
            if filecmp.cmp(full_path_v3, full_path_variant):
                os.remove(full_path_variant)
                created_refs.discard(variant)

    if created_refs:
        log.info(f"{proj}: creating new refs: {created_refs}")

    if diff := set(new_refs) - created_refs:
        log.warning(f"{proj}: Download is missing new reference files: {pformat(diff)}")

    if diff := loaded_refs - set(refs_to_update):
        log.warning(f"{proj}: Download contains additinal ref files we didn't expect: {pformat(diff)}")

    if diff := set(refs_to_update) - loaded_refs - linked_refs - skipped_refs:
        log.warning(f"{proj}: Download is missing ref files we expected: {pformat(diff)}")

    for lref in linked_refs:
        if (orig := os.path.relpath(os.path.realpath(proj + "/" + lref), proj)) not in loaded_refs:
            log.warning(f"{proj}: Found diff for symlinked ref: {lref}, but original ref: {orig} is unchanged")

    if not (loaded_refs or created_refs):
        log.warning(f"{proj}: no references to update!")
        return ""

    # Everything that does not come from a literal is quoted, so that paths,
    # branch names or commit messages containing quotes/spaces/shell
    # metacharacters can not break (or extend) the command line.
    git_ident = "-c user.name='RefBot' -c user.email='lhcbsoft@cern.ch'"
    quoted_paths = " ".join(shlex.quote(path) for path in sorted(loaded_refs | created_refs))
    quoted_msg = shlex.quote(f"{commit_msg}\n\n[skip ci]")
    try:
        sp.check_call(
            f"git add -f {quoted_paths} && "
            f"git {git_ident} commit -m {quoted_msg} && "
            f"git fetch --depth 1 origin {shlex.quote(target_branch)} && "  # just in case we reuse the clone
            f"git checkout -B {shlex.quote(branch_name)} FETCH_HEAD && "
            f"git {git_ident} cherry-pick tmp",
            shell=True,
            cwd=proj,
        )
    except sp.CalledProcessError:
        log.warning(f"Error during git operations, no reference update MR created for {proj}!")
        return ""

    return proj


def slot_is_ready(slot_json: JsonDict, force: bool) -> JsonDict:
    """Make sure the tests are finished and builds are okay.

    Return dict with some metadata for each project to process.
    For every active project a "job_status" dict is filled with one entry per
    platform in ALL_PLATFORMS: "tests_timeout" (default), "build_failed",
    "tests_failed" or "tests_passed". Projects without tests are not returned.
    """
    active_projects = get_active_projects_in_slot(slot_json)

    # Let's check that all builds have completed without errors
    for plat in ALL_PLATFORMS:
        for proj, meta in active_projects.items():
            if "job_status" not in meta:
                meta["job_status"] = {}

            # unless we know otherwise, consider the job as timed out
            meta["job_status"][plat] = "tests_timeout"

            # test if build failed
            try:
                # access can cause a KeyError if not finished
                # if we have errors > 0, we also raise error
                if slot_json["builds"][plat][proj]["errors"]:
                    raise KeyError
            except KeyError:
                # ignore if it's one of the problem projects
                if plat in BUILD_EXCEPTIONS.get(proj, {}):
                    # we won't check tests for a platform that can't build so remember this
                    log.warning(f"{proj}/{plat}: ignoring build failure!")
                    meta["job_status"][plat] = "build_failed"
                    continue
                handle_processing_error(f"Failed or unfinished build of {proj} on {plat}!", force, log)
                continue

            if meta["no_tests"]:
                continue

            try:
                # If "completed" is set, we check if any tests failed
                # Otherwise we have 3 possible outcomes:
                # 1. KeyError on access -> entry isn't in couchdb because test job hasn't started.
                #    - It may happen that a job never starts because the system is overloaded. Then the project entry
                #      is missing (forever). In that case, we consider this as a timeout (go to the "else" clause).
                # 2. completed == "null" -> test job hasn't completed yet
                # 3. completed == "null" because the test job timed out / crashed and couldn't update couchdb
                # 2. & 3. can be differentiated by checking when the slot checkout started. If it started > 14h
                # before now then the job timed out. This will be remembered for the checking of references and
                # tests later but we don't abort the entire job.
                # If the start is more recent, we quit here in the same manner as for 1.
                if slot_json["tests"][plat].get(proj, {}).get("completed"):
                    meta["job_status"][plat] = (
                        "tests_failed" if "FAIL" in slot_json["tests"][plat][proj]["results"] else "tests_passed"
                    )
                else:
                    checkout_started = slot_json["checkout"]["started"]
                    # slot started over 14h ago, we know it's a timeout
                    timed_out = datetime.now() - datetime.strptime(
                        checkout_started, "%Y-%m-%dT%H:%M:%S.%f"
                    ) > timedelta(hours=14)
                    # if time delta is < 14h try and check via the jenkins api if the job
                    # is actually still running
                    if timed_out or not jenkins_job_is_running(slot_json["tests"][plat][proj]["build_url"]):
                        # test job was killed. Maybe timeout, but other crashes are also possible
                        log.warning(f"{proj}/{plat}: Test job did not finish")
                        continue
                    raise KeyError
            except KeyError:
                # if we are here it should mean that a tests job hasn't completed yet
                handle_processing_error(f"It seems that the tests of {proj} on {plat} are not finished!", force, log)

    # Projects we want to process should have tests
    return {k: v for (k, v) in active_projects.items() if (not v["no_tests"])}


def process_test_xml(
    slot_name: str,
    slot_id: int,
    project: str,
    platform: str,
    meta: JsonDict,
    force: bool,
) -> None:
    """Download and parse Test.xml and populate the passed dict {meta} with information we need from the xml

    Download the Test.xml for slot {slot_name}/{slot_id} for {project} and {platform}
    Then we extract information on each Test and populate the passed dictionary {meta}
    """
    url = os.path.join(NIGHTLIES_XML_URL, slot_name, str(slot_id), project, platform, "Test.xml")
    log.debug(f"Processing Test.xml for platform {platform} URL={url}")
    response = requests.get(url)
    if not response:
        log.warning(f"{project}/{platform}: Couldn't access Test.xml at {url}.")
        return

    tests = parse_test_xml(response.text, f"{project}/{platform}", force)

    if "tests" not in meta:
        meta["tests"] = defaultdict(dict)

    for name, test_info in tests.items():
        # if the build failed, we don't care about details of "how" the test failed
        # as we don't trust a test from a failing build
        if meta["job_status"][platform] == "build_failed":
            test_info["Status"] = "build_failed"

        # check if test has ref
        # if not, we can't really do much but warn if it is a failed test
        if not test_info.get("Output Reference File"):
            if (ts := test_info["Status"]) not in ["passed", "notrun"]:
                s = "failed" if ts != "timeout" else "timed out"
                log.warning(f"{project}/{platform}: Test without ref {s}: {name}!")
            # non-ref tests can be skipped as we don't do anything fancy for them
            continue

        meta["tests"][name].update({platform: test_info})


def parse_test_xml(text: str, warn_prefix: str, force: bool) -> JsonDict:
    """Parse the content of a Test.xml and return Dict{test name : test info}

    text is the xml document, warn_prefix is prepended to warnings
    The "Status" of a test is rewritten to "timeout" or "error" depending on its
    "Causes" field. Tests that miss an error reference file are reported via
    handle_processing_error and left out of the result.
    """
    results = {}
    # create an XML tree to easily traverse test info
    tree = ET.fromstring(text)
    for test_element in tree[0].findall("Test"):
        test = parse_test_xml_test(test_element, force)
        name = test["Name"]

        if test["Status"] == "notrun" and test["Completion Status"] == "Fixture dependency failed":
            log.warning(
                f"{warn_prefix}: Test {name} was not run because of failed dependencies and is possibly missing a reference update!"
            )

        # no "Causes" but status failed happens for e.g. Boost tests
        if "Causes" in test:
            if "unexpected timeout" == test["Causes"]:
                # debug is enough here as this will cause a "missing platform for ref xyz" warning later
                log.debug(f"Test {name} timed out!")
                test["Status"] = "timeout"

            # probably not that urgent because it almost never happens
            if "missing error reference file" in test["Causes"]:
                handle_processing_error(
                    f"Test {name} is missing a error reference file!\n"
                    f"RefBot can't handle this, manual intervention needed!",
                    force,
                    log,
                )
                continue

            # FIXME this means something is still wrong, I don't think we should update refs
            # incomplete ref updates are garbage, so is this reason enough to abort completely?
            # I think so, but it's late so I'm leaving this for a later version of myself to decide
            if "exit code" in test["Causes"] or "WARNING" in test["Causes"]:
                # debug is enough here as this will cause a "missing platform for ref xyz" warning later
                # NOTE If we have a test error in all platforms we will completely ignore it, but
                # the nightlies will still have a ref for them. So expect to see a warning about downloading
                # more ref files than we expect
                log.warning(f"{warn_prefix}: Test {name} contains nonzero exit code and/or warnings!")
                test["Status"] = "error"

        results[name] = test

    return results


def parse_test_xml_test(test_element: ET.Element, force: bool) -> JsonDict:
    """Extract the fields we care about from a single <Test> xml element

    The result always contains "Name" and "Status", plus every entry of FIELDS
    that is present with a text value. Reference file paths are joined with the
    test's "Path" and normalised.
    """
    FIELDS = [
        "1DProfilesMismatch",
        "1DHistogramsMismatch",
        "Causes",
        "CountersMismatch",
        "Output Reference File",
        "New Output Reference File",
        "Output Diff",
        "Completion Status",
        # Pytest temporal compatibility:
        "Test.test_fixture_setup.reference_file",
    ]

    if (name_elem := test_element.find("Name")) is None or name_elem.text is None:
        handle_processing_error("Loop over xml found Test entry without 'Name' field", force, log)
        name = "NOT-FOUND"
    else:
        name = name_elem.text

    if (path_elem := test_element.find("Path")) is None or path_elem.text is None:
        handle_processing_error("Loop over xml found Test entry without 'Path' field", force, log)
        path = "NOT-FOUND"
    else:
        path = path_elem.text

    ret = {
        "Name": name,
        "Status": test_element.attrib["Status"],
    }

    for field in FIELDS:
        tmp = test_element.find(f"./Results/NamedMeasurement[@name='{field}']/Value")
        # for some reason xml element objects convert to False
        # so we have to explicitly check for != None
        if tmp is not None and tmp.text is not None:
            ret[field] = tmp.text

    for field in ["Output Reference File", "New Output Reference File"]:
        if field in ret:
            ret[field] = os.path.normpath(os.path.join(path, ret[field]))

    # Pytest temporal compatibility:
    if "Test.test_fixture_setup.reference_file" in ret:
        ret["Output Reference File"] = ret["Test.test_fixture_setup.reference_file"]
        ret["New Output Reference File"] = ret["Test.test_fixture_setup.reference_file"] + ".new"

    return ret


def get_unclean_projects(
    projects: JsonDict,
    mr_slot_meta: JsonDict,
    force: bool,
) -> List[str]:
    """return list of project names which need detailed checking

    For each returned project the Test.xml of every usable platform is
    downloaded and parsed into the project's entry of {projects}.
    """
    ret = []
    mr_slot_mrs = slot_merges(projects)
    for proj, meta in projects.items():
        if not can_be_affected(proj, [mr[0] for mr in mr_slot_mrs], mr_slot_meta):
            log.info(f"{proj}: Cannot be affected by any of {mr_slot_mrs}. Skipping further processing.")
            continue

        if all(meta["job_status"][p] == "tests_passed" for p in ALL_PLATFORMS):
            log.info(f"{proj}: All platforms passed. Skipping further processing.")
            continue

        if all(meta["job_status"][p] == "build_failed" for p in ALL_PLATFORMS):
            log.warning(f"{proj}: Build failed on all platforms. Skipping project.")
            continue

        # If we get here, we need to download and parse the Test.xml for each platform
        # of our project to figure out which tests passed/failed/timed out etc.
        log.info(f"{proj}: needs detailed checking, processing Test.xml.")
        ret.append(proj)
        for plat in ALL_PLATFORMS:
            # if entire jenkins job timed out or build failed we don't need to get the Test.xml
            # we just treat the entire thing as a failed platform
            if (ps := meta["job_status"][plat]) == "tests_timeout" or ps == "build_failed":
                continue
            # this call will populate the dict "meta" with the additional information we need
            process_test_xml(mr_slot_meta["slot"], mr_slot_meta["build_id"], proj, plat, meta, force)

    return ret


def check_projects(
    projects: JsonDict,
    projects_to_check: List[str],
    force: bool,
) -> Dict[str, Tuple[Set[str], List[str], List[str]]]:
    """Decide per project which references need an update and where to get them from

    Returns Dict{project name : (platforms to download new refs from,
    refs to update, new refs to create)}. Projects for which nothing has to
    be downloaded are not part of the result.
    """
    ret = {}
    for proj in projects_to_check:
        log.debug(f"checking {proj}")
        # we keep track of the minimal set of platforms for wich we later need to download new refs
        plats_to_pull_refs = set()
        # we track every ref we want to update to later ensure that they are contained withing the downloaded ones
        refs_to_update = []
        new_refs = []

        # we know some plats are missing from our checking in slot_is_ready()
        # so we only excpect to see the following platforms
        required_plats = {
            k for (k, v) in projects[proj]["job_status"].items() if v != "tests_timeout" and v != "build_failed"
        }

        try:
            tests = projects[proj]["tests"].items()
        except KeyError:
            # this means we do not have tests for a project,
            # but it's not a blocker if we force
            if force:
                log.warning("no test results found for project %s", proj)
                tests = []
            else:
                raise

        # test_name is the name of the test
        # test_by_plats is a dict that holds the test results for each platform
        for test_name, test_by_plats in tests:
            # We want to check that a failed test that shares a ref failed in the same way
            # on all platforms.
            # NOTE don't forget that v4 tests can share a v3 ref.
            # so collect results by key of "reference file name"

            # Sanity check for missing tests
            # e.g. if a test fails in a really weird way we might potentially ignore it because
            # it has missing fields in its CTest output so let's issue an extra warning.
            # After all, our little bot is trying to be as helpful as possible
            if required_plats != (tps := set(test_by_plats.keys())):
                log.warning(
                    f"{proj}: Warning, results for test: {test_name} not found for platforms: {required_plats - tps}"
                )
                # sanity check
                if tps - required_plats:
                    log.warning(
                        f"{proj} buggy logic, this shouldn't happen! more platforms here than required!? {tps-required_plats}"
                    )

            # all tests passed or passed + timeouts/notrun etc.
            # -> nothing to do so we can skip to next test
            if all(t["Status"] != "failed" for t in test_by_plats.values()):
                problem_plats = [(p, t["Status"]) for (p, t) in test_by_plats.items() if t["Status"] != "passed"]
                if problem_plats:
                    log.warning(
                        f"{proj}: Correctness for treatment of {test_name} not guaranteed."
                        f" Test platforms with unclear statuses:\n{pformat(problem_plats)}"
                    )
                continue

            # create a dict from ref-file -> [platforms using it]
            ref_sets: Dict[str, List[str]] = defaultdict(list)
            for plat, test_info in test_by_plats.items():
                ref_name = test_info["Output Reference File"]
                # Skip ref file update if file name contains certain strings ...
                if not skip_ref_update(plat, ref_name):
                    ref_sets[ref_name].append(plat)

            # for each ref, check if we need to update
            # if yes, ensure it's the same update for each platform that uses this test
            for ref_name, plats in ref_sets.items():
                statuses = [test_by_plats[p]["Status"] for p in plats]

                if all(s == "passed" for s in statuses):
                    continue

                # if no tests are 'passed' or 'failed' we don't have any that
                # even finished correctly.
                # FIXME can I get complete list of statuses from CTest
                # I know, 'timeout', 'notrun' 'build_failed', anything else?
                if all(s != "failed" and s != "passed" for s in statuses):
                    msg = f"{proj}: Update of {ref_name} not possible. No successful test platforms:\n{pformat(list(zip(plats, statuses)))}"
                    # FIXME: Ignore the check here for the default (non-detdesc) platforms.
                    if any("detdesc" in p for p in plats):
                        handle_processing_error(msg, force, log)
                    else:
                        log.warning(msg)
                    continue

                # any notrun, timeout, error -> no guarantee
                if any(s != "failed" and s != "passed" for s in statuses):
                    # only printout the platforms which trigger the warning
                    problem_plats = [(p, s) for (p, s) in zip(plats, statuses) if s != "passed" and s != "failed"]
                    log.warning(
                        f"{proj}: Correctness for treatment of {ref_name} not guaranteed."
                        f" Test platforms with unclear statuses:\n{pformat(problem_plats)}"
                    )

                # if we have passed and failed we already know something is going wrong on at least on platform
                # thus we can't possibly update refs for this test.
                # We abort.
                if any(s == "failed" for s in statuses) and any(s == "passed" for s in statuses):
                    if any(re.match(p, ref_name) for p in UNSTABLE_TEST_REFS):
                        log.warning(
                            f"Update of {ref_name} skipped, mixed statuses of 'passed' and 'failed' found:\n\n"
                            + pformat(dict(zip(plats, statuses)))
                        )
                        continue
                    handle_processing_error(
                        (
                            f"{proj}: Update of {ref_name} not possible, mixed statuses of 'passed' and 'failed' found:\n\n"
                            + pformat(dict(zip(plats, statuses)))
                        ),
                        force,
                        log,
                    )
                    continue

                # do we have any failed tests? could be that it was just passed + timeout/error
                # check all tests which are passed or failed results for equality and remember that this ref needs
                # to be updated
                if any(s == "failed" for s in statuses):
                    tests_to_check = [(test_by_plats[p], p) for p in plats if test_by_plats[p]["Status"] == "failed"]
                    if not all(tests_to_check[0][0] == a[0] for a in tests_to_check[1:]):
                        # FIXME IMHO this needs to be treated better
                        # we have two scenarios
                        # 1. Counters are printed out with more precision than we test for
                        #    so this could fail, even though we would consider the counters ok
                        # 2. Actually drastic differences within the platforms, this is something we would
                        #    surely want to catch here and issue a strong warning or even error
                        # We could get rid of 1. by only printing the numbers of counters up to their sensitivity
                        # but this needs changes in LHCbTest
                        # given that we don't want to skip or abort cases like 1. for now we just warn and continue
                        log.warning(f"{proj}: Different test results for {ref_name} on platforms {plats}")
                        (test0, plat0) = tests_to_check[0]
                        log.warning(f"{proj}: Comparing {plat0} to:")
                        for test, plat in tests_to_check[1:]:
                            is_same = test == test0
                            log.warning(f"{proj}:\t {plat} {'is identical' if is_same else 'differs by:'}")
                            if not is_same:
                                for k, v in test.items():
                                    if k not in test0:
                                        log.warning(f"{proj}:\t Key '{k}' not found")
                                    elif v != test0[k]:
                                        log.warning(f"{proj}:\t Differences in {k}:")
                                        # lines that differ are by default those that are not in both sets
                                        # that's what set ^ set gives us.
                                        # set isn't keeping any order, so we print sorted() of the result to make sure
                                        # the lines that belong together stay together.
                                        test0_lines = set(test0[k].splitlines())
                                        diff = sorted(set(v.splitlines()) ^ test0_lines)
                                        # out of the lines that differ, these belong to the p0 platform
                                        p0_lines = "\n".join(line for line in diff if line in test0_lines)
                                        # and these to the current (plat) platform
                                        # (membership in the line set, not a substring test on the joined text)
                                        plat_lines = "\n".join(line for line in diff if line not in test0_lines)
                                        log.warning(f"{proj}:\t{plat0}:\n{p0_lines[:500] + (p0_lines[500:] and '...')}")
                                        log.warning(
                                            f"{proj}:\t{plat}:\n{plat_lines[:500] + (plat_lines[500:] and '...')}"
                                        )

                    # check if the tests all fail because of a missing ref file
                    # in that case we keep track of these in the list `new_refs`
                    causes = [t[0]["Causes"] for t in tests_to_check if "Causes" in t[0]]
                    cause_is_missing_ref = [c == "unexpected missing reference file" for c in causes]
                    if any(cause_is_missing_ref):
                        if not all(cause_is_missing_ref):
                            handle_processing_error(
                                f"{proj}: Can't fix missing reference file for {test_name}, different causes found {causes}",
                                force,
                                log,
                            )
                            continue
                        new_refs.append(tests_to_check[0][0]["Output Reference File"])
                        v4_plat = (
                            "x86_64_v4-" + OS_VER + "-" + GCC_VER + "+detdesc-opt+g"
                            if "detdesc" in ref_name
                            else "x86_64_v4-" + OS_VER + "-" + GCC_VER + "-opt+g"
                        )
                        # The v4 refs are only pulled if the v4 platform actually ran this test;
                        # an unconditional plats.remove(v4_plat) raises ValueError otherwise.
                        if v4_plat in plats:
                            plats_to_pull_refs.add(v4_plat)
                        other_plats = [p for p in plats if p != v4_plat]
                        if other_plats:
                            # sorted in reverse to prefer the downloads from gcc and opt over dbg,
                            # as they don't have as many timeouts -> I need to download less platforms
                            plat_for_pull = sorted(other_plats, reverse=True)[0]
                            log.debug(f"Pulling missing ref {ref_name} from {plat_for_pull}")
                            plats_to_pull_refs.add(plat_for_pull)
                        continue

                    # if we get here we want an update of the refs for the corresponding test
                    # sorted in reverse to prefer the downloads from gcc and opt over dbg, as they don't have as many timeouts
                    # -> I need to download less platforms
                    plat_for_pull = sorted(plats, reverse=True)[0]
                    log.debug(f"Pulling update for ref {ref_name} from {plat_for_pull}")
                    plats_to_pull_refs.add(plat_for_pull)
                    refs_to_update.append(ref_name)

        # if plats_to_pull_refs is not empty we have modified or new refs to handle
        if plats_to_pull_refs:
            ret[proj] = (plats_to_pull_refs, refs_to_update, new_refs)

    return ret


def update_references(
    project: str,
    mr_id: int,
    args: argparse.Namespace,
) -> Tuple[List[str], Dict[str, List[str]], str, str]:
    """Update references based on latest ci-test slot that build the given MR

    Returns (projects with a ref update commit, collected log history,
    gitlab branch name, gitlab MR title).
    Raises RuntimeError if no non-aborted build of the slot is found.
    """
    log.debug(f"master version of ref bot was triggered with args: {args}")

    # Out of all slots which include our MR we focus on the lhcb-{target}-mr ones
    # out of those lets pick the latest one.
    slot_name = f"lhcb-{args.target}-mr"
    if not args.build_id:
        # Get list of all slots which include the given MR
        # couchdb post allows for a json body to specify the query
        # it expects a json array named "keys" and then return all entries which match
        # one of the entries in keys. Note we pass ONE entry which is an array itself
        # as the VIEW in couchdb has a key with two entries, project ID e.g. "Rec" and
        # a number identifying the number of the MR
        resp = rq_post_to_json(MRS_VIEW_URL, json={"keys": [[project, mr_id]]})
        build_ids = sorted(
            [row["value"]["build_id"] for row in resp["rows"] if row["value"]["slot"] == slot_name], reverse=True
        )
    else:
        build_ids = [args.build_id]

    for build_id in build_ids:
        # Query the couchdb to obtain all metadata on the chosen *-mr slot
        mr_slot_meta = rq_get_to_json(os.path.join(COUCH_DB_SERVER, slot_name + f".{build_id}"))
        if "aborted" not in mr_slot_meta:
            break
    else:
        # also reached when build_ids is empty
        raise RuntimeError(f"No (non-aborted) builds exist for {project}/{mr_id}")

    log.info(f"Using slot {slot_name}/{build_id}")

    # sanity check that we know all platforms that are configured.
    found_plats = set(mr_slot_meta["config"]["platforms"])
    plats_diff = set(ALL_PLATFORMS) - found_plats

    # keep track of all messages with lvl >= WARNING, to forward them to GitLab at the end
    decorate_logger(log, LOG_HISTORY, mr_slot_meta)

    # works backwards compatible for slots with and without detdesc
    if plats_diff:
        handle_processing_error(
            f"Expected these platforms: {ALL_PLATFORMS}\nBut received these: {found_plats}\nMissing platforms: {plats_diff}",
            args.force,
            log,
        )

    # parse the slot metadata to check if the slot is in "OK enough" shape to continue
    projects = slot_is_ready(mr_slot_meta, args.force)

    if mrs_have_changed(projects):
        log.warning(
            "Some MRs have changed since the launch of the ci-test slot. "
            "We will continue with the ref update. Correctness not guaranteed!"
        )

    projects_base_commits = get_projects_base_commit(mr_slot_meta)

    # NOTE: Qualitative overview of the below:
    # we don't directly do all the work for one project but split it up into 3 steps
    # each of the 3 steps below is done for all projects before going to the next step.
    # 1. get_unclean_projects:
    #   - check if a project needs detailed checking, if yes download and parse Test.xml
    # 2. check_projects:
    #   - go through all Tests and check their failure causes, and other validation criteria
    #   - remember which references need to be updated and figure out which reference
    #     update zip files we need to download
    # 3. create_ref_update_commit:
    #   - get git project, download ref files, create MR with ref update
    #   - also validate that our expected ref updates from 2. match with what we downloaded.
    #
    # We do those 3 steps separately to make sure that we succeed to perform each step on all projects
    # first before continuing. I want to avoid creating 2 update MRs and then e.g. throwing an exception
    # on the next project in step 2. which would mean we would have to undo the MRs etc.

    # step 1
    projects_to_check = get_unclean_projects(projects, mr_slot_meta, args.force)

    # step 2
    projects_to_update = check_projects(projects, projects_to_check, args.force)

    # step 3
    # we parse the list of projects, and based on the MRs we find we create
    # the branch name and title for the gitlab MRs we need to create for the ref updates
    gitlab_br_name, gitlab_title = make_gitlab_mr_branch_name_and_title(mr_slot_meta)
    gitlab_title = f"{gitlab_title} based on {slot_name}/{build_id}"

    # FIXME: temporary hack to address https://gitlab.cern.ch/lhcb-rta/reference-update-bot/-/issues/20
    # while waiting for a proper fix
    def target_branch_for_project_in_slot(project: str, slot: str) -> str:
        # I could use args.target, but the actual logic should deduce the
        # branch from the slot
        branch = slot.replace("lhcb-", "").replace("-mr", "")
        if branch == "2024-patches" and project in ("Boole", "Online"):
            branch = "master"
        elif branch == "sim10":
            branch = "Sim10" if project == "Gauss" else "sim10-patches"
        return branch

    projects_to_push = []
    for proj, (plats_to_pull_refs, refs_to_update, new_refs) in projects_to_update.items():
        if proj not in projects_base_commits:
            # get_projects_base_commit() leaves out projects without a commit (and warns
            # that their updates are skipped), so skip them here instead of raising KeyError
            log.warning(f"{proj}: no base commit available, skipping reference update!")
            continue
        pushed = create_ref_update_commit(
            projects_base_commits[proj],
            slot_name,
            build_id,
            proj,
            target_branch_for_project_in_slot(proj, slot_name),
            plats_to_pull_refs,
            refs_to_update,
            new_refs,
            gitlab_br_name,
            gitlab_title,
        )
        # an empty string means that no commit was created for this project
        if pushed:
            projects_to_push.append(pushed)

    log.info(f"Successfully updated references for: {projects_to_push}")

    return projects_to_push, LOG_HISTORY, gitlab_br_name, gitlab_title