From 7620007f32fd414a3e6f0c34af58db3eb295f1ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jelmer=20Vernoo=C4=B3?= <jelmer@jelmer.uk>
Date: Sat, 28 Jan 2023 14:50:02 +0000
Subject: [PATCH] Integrate success and duration determination

---
 janitor/schedule.py | 122 +++++++++++++++++++++++---------------------
 janitor/site/pkg.py |   4 +-
 2 files changed, 66 insertions(+), 60 deletions(-)

diff --git a/janitor/schedule.py b/janitor/schedule.py
index 580d8ec5..861189cd 100644
--- a/janitor/schedule.py
+++ b/janitor/schedule.py
@@ -103,9 +103,48 @@ def queue_item_from_candidate_and_publish_policy(row):
             value, row['success_chance'])
 
 
-async def estimate_success_probability(
+async def _estimate_duration(
+    conn: asyncpg.Connection,
+    codebase: Optional[str] = None,
+    campaign: Optional[str] = None,
+) -> Optional[timedelta]:
+    """Average the duration of non-transient runs matching the given filters."""
+    # Only filters that were actually supplied become WHERE conditions.
+    filters = [("codebase", codebase), ("suite", campaign)]
+    active = [(column, value) for column, value in filters if value is not None]
+    statement = """
+SELECT AVG(finish_time - start_time) FROM run
+WHERE failure_transient is not True """
+    for position, (column, _value) in enumerate(active, start=1):
+        statement += f" AND {column} = ${position}"
+    parameters = [value for _column, value in active]
+    return await conn.fetchval(statement, *parameters)
+
+
+async def estimate_duration(
+    conn: asyncpg.Connection, codebase: str, campaign: str
+) -> timedelta:
+    """Estimate how long a codebase build takes for a certain campaign.
+
+    Averages are tried from most to least specific: codebase and campaign
+    together, then the codebase alone, then the campaign alone. The first
+    one that is not None wins; otherwise DEFAULT_ESTIMATED_DURATION seconds.
+    """
+    fallback_filters = (
+        {"codebase": codebase, "campaign": campaign},
+        {"codebase": codebase},
+        {"campaign": campaign},
+    )
+    for filters in fallback_filters:
+        average = await _estimate_duration(conn, **filters)
+        if average is not None:
+            return average
+    return timedelta(seconds=DEFAULT_ESTIMATED_DURATION)
+
+
+async def estimate_success_probability_and_duration(
     conn: asyncpg.Connection, codebase: str, campaign: str, context: Optional[str] = None
-) -> tuple[float, int]:
+) -> tuple[float, timedelta, int]:
     # TODO(jelmer): Bias this towards recent runs?
     total = 0
     success = 0
@@ -113,22 +152,25 @@ async def estimate_success_probability(
         same_context_multiplier = 0.5
     else:
         same_context_multiplier = 1.0
+    durations = []
     for run in await conn.fetch("""
 SELECT
-  result_code, instigated_context, context, failure_details, failure_transient,
-  start_time
+  result_code, instigated_context, context, failure_details,
+  finish_time - start_time AS duration
 FROM run
-WHERE codebase = $1 AND suite = $2
+WHERE codebase = $1 AND suite = $2 AND failure_transient IS NOT True
 ORDER BY start_time DESC
 """, codebase, campaign):
         try:
             ignore_checker = IGNORE_RESULT_CODE[run['result_code']]
         except KeyError:
             def ignore_checker(run):
-                return run['failure_transient']
+                return False
 
         if ignore_checker(run):
             continue
+
+        durations.append(run['duration'])
         total += 1
         if run['result_code'] == "success":
             success += 1
@@ -148,54 +190,18 @@ def ignore_checker(run):
         # we don't know the context.
         same_context_multiplier = 1.0
 
-    return ((success * 10 + 1) / (total * 10 + 1) * same_context_multiplier), total
-
-
-async def _estimate_duration(
-    conn: asyncpg.Connection,
-    codebase: Optional[str] = None,
-    campaign: Optional[str] = None,
-    limit: Optional[int] = 1000,
-) -> Optional[timedelta]:
-    query = """
-SELECT AVG(duration) FROM
-(select finish_time - start_time as duration FROM run
-WHERE """
-    args = []
-    if codebase is not None:
-        query += " codebase = $1"
-        args.append(codebase)
-    if campaign is not None:
-        if codebase:
-            query += " AND"
-        query += " suite = $%d" % (len(args) + 1)
-        args.append(campaign)
-    query += " ORDER BY finish_time DESC"
-    if limit is not None:
-        query += " LIMIT %d" % limit
-    query += ") as q"
-    return await conn.fetchval(query, *args)
-
-
-async def estimate_duration(
-    conn: asyncpg.Connection, codebase: str, campaign: str
-) -> timedelta:
-    """Estimate the duration of a codebase build for a certain campaign."""
-    estimated_duration = await _estimate_duration(
-        conn, codebase=codebase, campaign=campaign
-    )
-    if estimated_duration is not None:
-        return estimated_duration
-
-    estimated_duration = await _estimate_duration(conn, codebase=codebase)
-    if estimated_duration is not None:
-        return estimated_duration
-
-    estimated_duration = await _estimate_duration(conn, campaign=campaign)
-    if estimated_duration is not None:
-        return estimated_duration
+        # It's going to be hard to estimate the duration, but other codemods
+        # might be a good candidate
+        estimated_duration = await _estimate_duration(conn, codebase=codebase)
+        if estimated_duration is None:
+            estimated_duration = await _estimate_duration(conn, campaign=campaign)
+        if estimated_duration is None:
+            estimated_duration = timedelta(seconds=DEFAULT_ESTIMATED_DURATION)
+    else:
+        estimated_duration = timedelta(
+            seconds=(sum([d.total_seconds() for d in durations]) / len(durations)))
 
-    return timedelta(seconds=DEFAULT_ESTIMATED_DURATION)
+    return ((success * 10 + 1) / (total * 10 + 1) * same_context_multiplier), estimated_duration, total
 
 
 # Overhead of doing a run; estimated to be roughly 20s
@@ -288,14 +294,14 @@ async def do_schedule_regular(
             context = row['context']
         if row is not None and command is None:
             command = row['command']
-    estimated_duration = await estimate_duration(conn, codebase, campaign)
-    assert estimated_duration >= timedelta(
-        0
-    ), "{}: estimated duration < 0.0: {!r}".format(codebase, estimated_duration)
     (
         estimated_probability_of_success,
+        estimated_duration,
         total_previous_runs,
-    ) = await estimate_success_probability(conn, codebase, campaign, context)
+    ) = await estimate_success_probability_and_duration(conn, codebase, campaign, context)
+
+    assert estimated_duration >= timedelta(0), \
+        f"{codebase}: estimated duration < 0.0: {estimated_duration!r}"
 
     if normalized_codebase_value is None:
         normalized_codebase_value = await conn.fetchval(
diff --git a/janitor/site/pkg.py b/janitor/site/pkg.py
index 14bf469a..6238bd63 100644
--- a/janitor/site/pkg.py
+++ b/janitor/site/pkg.py
@@ -134,7 +134,7 @@ async def generate_run_file(
         differ_url: Optional[str], publisher_url: Optional[str], logfile_manager, run,
         vcs_managers: Dict[str, VcsManager], is_admin, span
 ):
-    from ..schedule import estimate_success_probability
+    from ..schedule import estimate_success_probability_and_duration
     kwargs = {}
     kwargs["run"] = run
     kwargs["run_id"] = run['id']
@@ -166,7 +166,7 @@ async def generate_run_file(
                 'FROM review WHERE run_id = $1',
                 run['id'])
         with span.new_child('sql:success-probability'):
-            kwargs["success_probability"], kwargs["total_previous_runs"] = await estimate_success_probability(
+            kwargs["success_probability"], kwargs['estimated_duration'], kwargs["total_previous_runs"] = await estimate_success_probability_and_duration(
                 conn, run['package'], run['suite'])
         with span.new_child('sql:followups'):
             kwargs['followups'] = await conn.fetch("""SELECT \