%pip uninstall --yes 'keras' 'matplotlib' 'scikit-learn' 'tensorflow'

Found existing installation: keras 3.10.0
Uninstalling keras-3.10.0:
  Successfully uninstalled keras-3.10.0
Found existing installation: matplotlib 3.10.0
Uninstalling matplotlib-3.10.0:
  Successfully uninstalled matplotlib-3.10.0
Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Found existing installation: tensorflow 2.19.0
Uninstalling tensorflow-2.19.0:
  Successfully uninstalled tensorflow-2.19.0
Note: you may need to restart the kernel to use updated packages.

import warnings
warnings.simplefilter('ignore')

import os
import sys
import subprocess

def set_env(input_archive, temp_dir):

    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir, exist_ok=True)
        
        subprocess.run(['tar', '-xzf', input_archive, '-C', temp_dir], check=True)
    
    subprocess.run([
        sys.executable, 
        '-m', 
        'pip', 
        'install', 
        '--q',
        '--no-index', 
        '--find-links', 
        f'{temp_dir}/wheels', 
        'unsloth', 
        'trl', 
        'vllm', 
        'openai_harmony'
    ], check=True)

set_env(
    input_archive='/kaggle/input/aimo-3-utils/wheels.tar.gz', 
    temp_dir='/kaggle/tmp/setup'
)

subprocess.run(['ls', '/kaggle/tmp/setup/tiktoken_encodings'])

cl100k_base.tiktoken
o200k_base.tiktoken

CompletedProcess(args=['ls', '/kaggle/tmp/setup/tiktoken_encodings'], returncode=0)

os.environ['TRANSFORMERS_NO_TF'] = '1'
os.environ['TRANSFORMERS_NO_FLAX'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['TRITON_PTXAS_PATH'] = '/usr/local/cuda/bin/ptxas'
os.environ['TIKTOKEN_ENCODINGS_BASE'] = '/kaggle/tmp/setup/tiktoken_encodings'

import gc
import re
import math
import time
import queue
import threading
import contextlib
from typing import Optional
from jupyter_client import KernelManager
from collections import Counter, defaultdict
from concurrent.futures import as_completed, ThreadPoolExecutor

import pandas as pd
import polars as pl
from openai import OpenAI
from openai_harmony import (
    HarmonyEncodingName, load_harmony_encoding,
    SystemContent, ReasoningEffort, ToolNamespaceConfig,
    Author, Message, Role, TextContent, Conversation
)
from transformers import set_seed
import kaggle_evaluation.aimo_3_inference_server

# ─────────────────────────────────────────────────────────────────────────────
# CFG
# ─────────────────────────────────────────────────────────────────────────────

class CFG:

    system_prompt = (
        'You are an elite mathematical problem solver with expertise at the International '
        'Mathematical Olympiad (IMO) level. Your goal is to find the correct answer through '
        'rigorous mathematical reasoning.\n\n'

        '# Problem-Solving Approach:\n'
        '1. UNDERSTAND: Carefully read and rephrase the problem in your own words. '
        'Identify what is given, what needs to be found, and any constraints.\n'
        '2. EXPLORE: Consider multiple solution strategies. Think about relevant theorems, '
        'techniques, patterns, or analogous problems. Don\'t commit to one approach immediately.\n'
        '3. PLAN: Select the most promising approach and outline key steps before executing.\n'
        '4. EXECUTE: Work through your solution methodically. Show all reasoning steps clearly.\n'
        '5. VERIFY: Check your answer by substituting back, testing edge cases, or using '
        'alternative methods. Ensure logical consistency throughout.\n\n'

        '# Mathematical Reasoning Principles:\n'
        '- Break complex problems into smaller, manageable sub-problems\n'
        '- Look for patterns, symmetries, and special cases that provide insight\n'
        '- Use concrete examples to build intuition before generalizing\n'
        '- Consider extreme cases and boundary conditions\n'
        '- If stuck, try working backwards from the desired result\n'
        '- Be willing to restart with a different approach if needed\n\n'

        '# Verification Requirements:\n'
        '- Cross-check arithmetic and algebraic manipulations\n'
        '- Verify that your solution satisfies all problem constraints\n'
        '- Test your answer with simple cases or special values when possible\n'
        '- Ensure dimensional consistency and reasonableness of the result\n\n'

        '# Output Format:\n'
        'The final answer must be a non-negative integer between 0 and 99999.\n'
        'Place your final numerical answer inside \\boxed{}, e.g., \\boxed{42}\n\n'

        'Think step-by-step and show your complete reasoning process. Quality of reasoning '
        'is as important as the final answer.'
    )

    system_prompt_alt = (
        'You are an elite mathematical problem solver. '
        'Your goal is to find the correct answer through rigorous reasoning.\n\n'

        '# IMPORTANT: Use a DIFFERENT approach than the most obvious one.\n'
        '- If the problem seems to call for algebra, try a combinatorial approach\n'
        '- If it seems combinatorial, try generating small cases computationally\n'
        '- Always verify your answer independently before committing\n'
        '- Be especially suspicious if your answer feels "too clean" or "too round"\n\n'

        '# Mathematical Reasoning Principles:\n'
        '- Break complex problems into smaller, manageable sub-problems\n'
        '- Look for patterns, symmetries, and special cases that provide insight\n'
        '- Use concrete examples to build intuition before generalizing\n'
        '- Consider extreme cases and boundary conditions\n'
        '- If stuck, try working backwards from the desired result\n'
        '- Be willing to restart with a different approach if needed\n\n'

        '# Verification Requirements:\n'
        '- Cross-check arithmetic and algebraic manipulations\n'
        '- Verify that your solution satisfies all problem constraints\n'
        '- Test your answer with simple cases or special values when possible\n'
        '- Ensure dimensional consistency and reasonableness of the result\n\n'

        '# Output Format:\n'
        'The final answer must be a non-negative integer between 0 and 99999.\n'
        'Place your final numerical answer inside \\boxed{}, e.g., \\boxed{42}\n\n'

        'Think step-by-step. Show complete reasoning.'
    )

    system_prompt_verify = (
        'You are an elite mathematical problem solver. '
        'Your goal is to find the correct answer through rigorous reasoning '
        'WITH MANDATORY COMPUTATIONAL VERIFICATION.\n\n'

        '# REQUIRED APPROACH:\n'
        '1. Solve the problem analytically first\n'
        '2. You MUST write Python code to verify your answer by brute force:\n'
        '   - If the problem asks "how many X", enumerate ALL X and count them\n'
        '   - If the problem asks for a maximum/minimum, check all candidates\n'
        '   - Compare your computational result to your analytical result\n'
        '   - If they disagree, trust the computation and re-examine your analysis\n'
        '3. Only submit an answer after computation confirms it\n\n'

        '# Mathematical Reasoning Principles:\n'
        '- Break complex problems into smaller, manageable sub-problems\n'
        '- Look for patterns, symmetries, and special cases that provide insight\n'
        '- Use concrete examples to build intuition before generalizing\n'
        '- Consider extreme cases and boundary conditions\n'
        '- If stuck, try working backwards from the desired result\n'
        '- Be willing to restart with a different approach if needed\n\n'

        '# Verification Requirements:\n'
        '- Cross-check arithmetic and algebraic manipulations\n'
        '- Verify that your solution satisfies all problem constraints\n'
        '- ALWAYS run a brute-force enumeration to confirm the final answer\n'
        '- If computation and analysis disagree, resolve before answering\n\n'

        '# Output Format:\n'
        'The final answer must be a non-negative integer between 0 and 99999.\n'
        'Place your final numerical answer inside \\boxed{}, e.g., \\boxed{42}\n\n'

        'Think step-by-step. Verify computationally. Never skip the brute-force check.'
    )

    tool_prompt = (
        'Use this tool to execute Python code for:\n'
        '- Complex calculations that would be error-prone by hand\n'
        '- Numerical verification of analytical results\n'
        '- Generating examples or testing conjectures\n'
        '- Visualizing problem structure when helpful\n'
        '- Brute-force verification for small cases\n\n'

        'The environment is a stateful Jupyter notebook. Code persists between executions.\n'
        'Always use print() to display results. Write clear, well-commented code.\n\n'

        'Remember: Code should support your mathematical reasoning, not replace it. '
        'Explain what you\'re computing and why before running code.'
    )

    preference_prompt = (
        'You have access to `math`, `numpy`, and `sympy` for:\n\n'

        '# Symbolic Computation (sympy):\n'
        '- Algebraic manipulation and simplification\n'
        '- Solving equations and systems of equations\n'
        '- Symbolic differentiation and integration\n'
        '- Number theory functions (primes, divisors, modular arithmetic)\n'
        '- Polynomial operations and factorization\n'
        '- Working with mathematical expressions symbolically\n\n'

        '# Numerical Computation (numpy):\n'
        '- Array operations and linear algebra\n'
        '- Efficient numerical calculations for large datasets\n'
        '- Matrix operations and eigenvalue problems\n'
        '- Statistical computations\n\n'

        '# Mathematical Functions (math):\n'
        '- Standard mathematical functions (trig, log, exp)\n'
        '- Constants like pi and e\n'
        '- Basic operations for single values\n\n'

        'Best Practices:\n'
        '- Use sympy for exact symbolic answers when possible\n'
        '- Use numpy for numerical verification and large-scale computation\n'
        '- Combine symbolic and numerical approaches: derive symbolically, verify numerically\n'
        '- Document your computational strategy clearly\n'
        '- Validate computational results against known cases or theoretical bounds'
    )

    served_model_name = 'gpt-oss'
    model_path        = '/kaggle/input/gpt-oss-120b/transformers/default/1'

    kv_cache_dtype = 'fp8_e4m3'
    dtype          = 'auto'

    high_problem_timeout = 900
    base_problem_timeout = 300
    notebook_limit       = 17400
    server_timeout       = 180
    session_timeout      = 960
    jupyter_timeout      = 6
    sandbox_timeout      = 3

    stream_interval = 200
    context_tokens  = 65536
    buffer_tokens   = 512
    search_tokens   = 32
    top_logprobs    = 5
    batch_size      = 256

    # ── Convergence parameters ────────────────────────────────────────
    # Three conditions must ALL be satisfied to stop:
    #   1. top_count      >= early_stop          (absolute vote strength)
    #   2. lead           >= min_lead_*           (gap between top and second)
    #   3. total_valid    >= min_valid_answers    (enough data to trust lead)
    #
    # min_valid_answers replaces both the old min_valid_for_lead AND
    # min_attempts_before_early_stop — they were the same concept.

    early_stop             = 5   # top answer needs at least 5 votes
    min_lead_first_batch   = 3   # lead required to stop in batch 1
    min_lead_later_batches = 2   # lead required to stop in batch 2+
    min_valid_answers      = 6   # min valid answers before any stop fires

    attempts = 9     # attempts per batch (every batch)
    workers  = 16
    turns    = 128
    seed     = 42

    # Per-position temperature — same schedule reused every batch
    attempt_temperatures = [
        0.2,  # position 0 — standard
        0.2,  # position 1 — standard
        0.2,  # position 2 — standard
        0.2,  # position 3 — verify
        0.6,  # position 4 — verify
        0.6,  # position 5 — verify
        0.6,  # position 6 — alt
        0.6,  # position 7 — alt
        0.6,  # position 8 — alt
    ]

    # Prompt assignment per position within each batch
    # verify takes priority over alt
    # position 0,1,2 → standard
    # position 3,4,5   → verify  (brute-force enumeration required)
    # position 6,7,8 → alt   (different approach required)
    verify_prompt_attempts = {3, 4, 5}
    alt_prompt_attempts    = {6, 7, 8}

    # Extended time budget: 4x base, capped at 4x high_problem_timeout
    budget_multiplier    = 3
    extended_timeout_cap = high_problem_timeout * 3  # 2700s

    gpu_memory_utilization = 0.96
    min_p                  = 0.02


set_seed(CFG.seed)


# ─────────────────────────────────────────────────────────────────────────────
# Template
# ─────────────────────────────────────────────────────────────────────────────

class AIMO3Template:

    def __init__(self):
        pass

    def get_system_content(
        self,
        system_prompt: str,
        tool_config: ToolNamespaceConfig
    ) -> SystemContent:
        return (
            SystemContent.new()
            .with_model_identity(system_prompt)
            .with_reasoning_effort(reasoning_effort=ReasoningEffort.HIGH)
            .with_tools(tool_config)
        )

    def apply_chat_template(
        self,
        system_prompt: str,
        user_prompt: str,
        tool_config: ToolNamespaceConfig
    ) -> list[Message]:
        system_content = self.get_system_content(system_prompt, tool_config)
        system_message = Message.from_role_and_content(Role.SYSTEM, system_content)
        user_message   = Message.from_role_and_content(Role.USER, user_prompt)
        return [system_message, user_message]


# ─────────────────────────────────────────────────────────────────────────────
# Sandbox
# ─────────────────────────────────────────────────────────────────────────────

class AIMO3Sandbox:

    _port_lock = threading.Lock()
    _next_port = 50000

    @classmethod
    def _get_next_ports(cls, count: int = 5) -> list[int]:
        with cls._port_lock:
            ports = list(range(cls._next_port, cls._next_port + count))
            cls._next_port += count
            return ports

    def __init__(self, timeout: float):
        self._default_timeout = timeout
        self._owns_kernel     = False
        self._client          = None
        self._km              = None

        ports = self._get_next_ports(5)

        env = os.environ.copy()
        env['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'
        env['PYDEVD_WARN_EVALUATION_TIMEOUT'] = '0'
        env['JUPYTER_PLATFORM_DIRS']          = '1'
        env['PYTHONWARNINGS']                 = 'ignore'
        env['MPLBACKEND']                     = 'Agg'

        self._km = KernelManager()
        self._km.shell_port   = ports[0]
        self._km.iopub_port   = ports[1]
        self._km.stdin_port   = ports[2]
        self._km.hb_port      = ports[3]
        self._km.control_port = ports[4]

        self._km.start_kernel(
            env=env,
            extra_arguments=['--Application.log_level=CRITICAL']
        )

        self._client = self._km.blocking_client()
        self._client.start_channels()
        self._client.wait_for_ready(timeout=self._default_timeout)
        self._owns_kernel = True

        self.execute(
            'import math\n'
            'import numpy\n'
            'import sympy\n'
            'import itertools\n'
            'import collections\n'
            'import mpmath\n'
            'mpmath.mp.dps = 64\n'
        )

    def _format_error(self, traceback: list[str]) -> str:
        clean_lines = []
        for frame in traceback:
            clean_frame = re.sub(r'\x1b\[[0-9;]*m', '', frame)
            if 'File "' in clean_frame and 'ipython-input' not in clean_frame:
                continue
            clean_lines.append(clean_frame)
        return ''.join(clean_lines)

    def execute(self, code: str, timeout: float | None = None) -> str:
        client            = self._client
        effective_timeout = timeout or self._default_timeout

        msg_id = client.execute(
            code,
            store_history=True,
            allow_stdin=False,
            stop_on_error=False
        )

        stdout_parts = []
        stderr_parts = []
        start_time   = time.time()

        while True:
            if time.time() - start_time > effective_timeout:
                self._km.interrupt_kernel()
                return f'[ERROR] Execution timed out after {effective_timeout} seconds'

            try:
                msg = client.get_iopub_msg(timeout=1.0)
            except queue.Empty:
                continue

            if msg.get('parent_header', {}).get('msg_id') != msg_id:
                continue

            msg_type = msg.get('msg_type')
            content  = msg.get('content', {})

            if msg_type == 'stream':
                text = content.get('text', '')
                if content.get('name') == 'stdout':
                    stdout_parts.append(text)
                else:
                    stderr_parts.append(text)
            elif msg_type == 'error':
                stderr_parts.append(
                    self._format_error(content.get('traceback', []))
                )
            elif msg_type in {'execute_result', 'display_data'}:
                data = content.get('data', {})
                text = data.get('text/plain')
                if text:
                    stdout_parts.append(
                        text if text.endswith('\n') else f'{text}\n'
                    )
            elif msg_type == 'status':
                if content.get('execution_state') == 'idle':
                    break

        stdout = ''.join(stdout_parts)
        stderr = ''.join(stderr_parts)

        if stderr:
            return f'{stdout.rstrip()}\n{stderr}' if stdout else stderr
        return stdout if stdout.strip() else '[WARN] No output. Use print() to see results.'

    def close(self):
        with contextlib.suppress(Exception):
            if self._client:
                self._client.stop_channels()
        if self._owns_kernel and self._km is not None:
            with contextlib.suppress(Exception):
                self._km.shutdown_kernel(now=True)
            with contextlib.suppress(Exception):
                self._km.cleanup_resources()

    def reset(self):
        self.execute(
            '%reset -f\n'
            'import math\n'
            'import numpy\n'
            'import sympy\n'
            'import itertools\n'
            'import collections\n'
            'import mpmath\n'
            'mpmath.mp.dps = 64\n'
        )

    def __del__(self):
        self.close()


# ─────────────────────────────────────────────────────────────────────────────
# Tool
# ─────────────────────────────────────────────────────────────────────────────

class AIMO3Tool:

    def __init__(
        self,
        local_jupyter_timeout: float,
        tool_prompt: str,
        sandbox=None
    ):
        self._local_jupyter_timeout = local_jupyter_timeout
        self._tool_prompt           = tool_prompt
        self._jupyter_session       = sandbox
        self._owns_session          = sandbox is None
        self._execution_lock        = threading.Lock()
        self._init_lock             = threading.Lock()

    def _ensure_session(self):
        if self._jupyter_session is None:
            with self._init_lock:
                if self._jupyter_session is None:
                    self._jupyter_session = AIMO3Sandbox(
                        timeout=self._local_jupyter_timeout
                    )

    def _ensure_last_print(self, code: str) -> str:
        lines     = code.strip().split('\n')
        if not lines:
            return code
        last_line = lines[-1].strip()
        if 'print' in last_line or 'import' in last_line:
            return code
        if not last_line or last_line.startswith('#'):
            return code
        lines[-1] = 'print(' + last_line + ')'
        return '\n'.join(lines)

    @property
    def instruction(self) -> str:
        return self._tool_prompt

    @property
    def tool_config(self) -> ToolNamespaceConfig:
        return ToolNamespaceConfig(
            name='python',
            description=self.instruction,
            tools=[]
        )

    def _make_response(self, output: str, channel: str | None = None) -> Message:
        content = TextContent(text=output)
        author  = Author(role=Role.TOOL, name='python')
        message = Message(
            author=author, content=[content]
        ).with_recipient('assistant')
        if channel:
            message = message.with_channel(channel)
        return message

    def process_sync_plus(self, message: Message) -> list[Message]:
        self._ensure_session()
        raw_script   = message.content[0].text
        final_script = self._ensure_last_print(raw_script)
        with self._execution_lock:
            try:
                output = self._jupyter_session.execute(final_script)
            except TimeoutError as exc:
                output = f'[ERROR] {exc}'
        return [self._make_response(output, channel=message.channel)]


# ─────────────────────────────────────────────────────────────────────────────
# Solver
# ─────────────────────────────────────────────────────────────────────────────

class AIMO3Solver:

    def __init__(self, cfg, port: int = 8000):
        self.cfg      = cfg
        self.port     = port
        self.base_url = f'http://0.0.0.0:{port}/v1'
        self.api_key  = 'sk-local'
        self.template = AIMO3Template()
        self.encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
        self.stop_token_ids = self.encoding.stop_tokens_for_assistant_actions()

        self._preload_model_weights()
        self.server_process = self._start_server()

        self.client = OpenAI(
            base_url=self.base_url,
            api_key=self.api_key,
            timeout=self.cfg.session_timeout
        )

        self._wait_for_server()
        self._initialize_kernels()

        self.notebook_start_time = time.time()
        self.problems_remaining  = 50

    # ── Model loading & server ────────────────────────────────────────

    def _preload_model_weights(self) -> None:
        print(f'Loading model weights from {self.cfg.model_path} into OS Page Cache...')
        start_time    = time.time()
        files_to_load = []
        total_size    = 0

        for root, _, files in os.walk(self.cfg.model_path):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    files_to_load.append(file_path)
                    total_size += os.path.getsize(file_path)

        def _read_file(path: str) -> None:
            with open(path, 'rb') as f:
                while f.read(1024 * 1024 * 1024):
                    pass

        with ThreadPoolExecutor(max_workers=self.cfg.workers) as executor:
            list(executor.map(_read_file, files_to_load))

        elapsed = time.time() - start_time
        print(
            f'Processed {len(files_to_load)} files '
            f'({total_size / 1e9:.2f} GB) in {elapsed:.2f} seconds.\n'
        )

    def _start_server(self) -> subprocess.Popen:
        cmd = [
            sys.executable, '-m', 'vllm.entrypoints.openai.api_server',
            '--seed',                   str(self.cfg.seed),
            '--model',                  self.cfg.model_path,
            '--served-model-name',      self.cfg.served_model_name,
            '--tensor-parallel-size',   '1',
            '--max-num-seqs',           str(self.cfg.batch_size),
            '--gpu-memory-utilization', str(self.cfg.gpu_memory_utilization),
            '--host',                   '0.0.0.0',
            '--port',                   str(self.port),
            '--dtype',                  self.cfg.dtype,
            '--kv-cache-dtype',         self.cfg.kv_cache_dtype,
            '--max-model-len',          str(self.cfg.context_tokens),
            '--stream-interval',        str(self.cfg.stream_interval),
            '--async-scheduling',
            '--disable-log-stats',
            '--enable-prefix-caching',
        ]
        self.log_file = open('vllm_server.log', 'w')
        return subprocess.Popen(
            cmd,
            stdout=self.log_file,
            stderr=subprocess.STDOUT,
            start_new_session=True
        )

    def _wait_for_server(self):
        print('Waiting for vLLM server...')
        start_time = time.time()
        for _ in range(self.cfg.server_timeout):
            return_code = self.server_process.poll()
            if return_code is not None:
                self.log_file.flush()
                with open('vllm_server.log', 'r') as lf:
                    logs = lf.read()
                raise RuntimeError(
                    f'Server died with code {return_code}. Full logs:\n{logs}\n'
                )
            try:
                self.client.models.list()
                elapsed = time.time() - start_time
                print(f'Server is ready (took {elapsed:.2f} seconds).\n')
                return
            except Exception:
                time.sleep(1)
        raise RuntimeError('Server failed to start (timeout).\n')

    def _initialize_kernels(self) -> None:
        print(f'Initializing {self.cfg.workers} persistent Jupyter kernels...')
        start_time        = time.time()
        self.sandbox_pool = queue.Queue()

        def _create_sandbox():
            return AIMO3Sandbox(timeout=self.cfg.jupyter_timeout)

        with ThreadPoolExecutor(max_workers=self.cfg.workers) as executor:
            futures = [
                executor.submit(_create_sandbox)
                for _ in range(self.cfg.workers)
            ]
            for future in as_completed(futures):
                self.sandbox_pool.put(future.result())

        elapsed = time.time() - start_time
        print(f'Kernels initialized in {elapsed:.2f} seconds.\n')

    # ── Utilities ─────────────────────────────────────────────────────

    def _scan_for_answer(self, text: str) -> int | None:
        pattern = r'\\boxed\s*\{\s*([0-9,]+)\s*\}'
        matches = re.findall(pattern, text)
        if matches:
            try:
                value = int(matches[-1].replace(',', ''))
                if 0 <= value <= 99999:
                    return value
            except ValueError:
                pass

        pattern = r'final\s+answer\s+is\s*([0-9,]+)'
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            try:
                value = int(matches[-1].replace(',', ''))
                if 0 <= value <= 99999:
                    return value
            except ValueError:
                pass

        return None

    def _compute_mean_entropy(self, logprobs_buffer: list) -> float:
        if not logprobs_buffer:
            return float('inf')

        total_entropy = 0.0
        token_count   = 0

        for top_logprobs_dict in logprobs_buffer:
            if not isinstance(top_logprobs_dict, dict) or not top_logprobs_dict:
                continue
            token_entropy = 0.0
            for token_str, log_prob in top_logprobs_dict.items():
                prob = math.exp(log_prob)
                if prob > 0:
                    token_entropy -= prob * math.log2(prob)
            total_entropy += token_entropy
            token_count   += 1

        if token_count == 0:
            return float('inf')

        return total_entropy / token_count

    def _compute_lead(self, valid_answers: list) -> tuple[int, int, int]:
        """
        Returns (top_count, second_count, lead).
        second_count = 0 when only one distinct answer exists.
        lead is always an integer, never NaN or None.
        """
        if not valid_answers:
            return 0, 0, 0
        counts       = Counter(valid_answers).most_common()
        top_count    = counts[0][1]
        second_count = counts[1][1] if len(counts) > 1 else 0
        return top_count, second_count, top_count - second_count

    def _has_converged(
        self,
        valid_answers: list,
        is_first_batch: bool,
    ) -> bool:
        """
        Single convergence check used in both:
          - Early stop within a batch
          - While loop exit after a batch completes

        Three conditions must ALL be satisfied:
          1. total_valid >= min_valid_answers   (enough data to trust lead)
          2. top_count   >= early_stop          (absolute vote strength)
          3. lead        >= min_lead_*          (gap: stricter in batch 1)

        min_valid_answers replaces both the old min_valid_for_lead AND
        min_attempts_before_early_stop — they were the same concept.
        """
        top_count, second_count, lead = self._compute_lead(valid_answers)
        total_valid                   = len(valid_answers)

        # Condition 1: enough valid answers to trust any signal
        if total_valid < self.cfg.min_valid_answers:
            return False

        # Condition 2: top answer needs absolute vote strength
        if top_count < self.cfg.early_stop:
            return False

        # Condition 3: lead threshold — stricter in first batch
        min_lead = (
            self.cfg.min_lead_first_batch
            if is_first_batch
            else self.cfg.min_lead_later_batches
        )
        if lead < min_lead:
            return False

        return True

    def _pick_prompt(self, position: int) -> str:
        """
        Returns the system prompt for a given position within a batch.
        verify takes priority over alt.

        Position 0,1,2   → standard  (convergent, rigorous)
        Position 3,4     → verify    (brute-force enumeration required)
        Position 5,6,7,8 → alt       (different approach required)
        """
        if position in self.cfg.verify_prompt_attempts:
            return self.cfg.system_prompt_verify
        elif position in self.cfg.alt_prompt_attempts:
            return self.cfg.system_prompt_alt
        else:
            return self.cfg.system_prompt

    # ── Single attempt ────────────────────────────────────────────────

    def _process_attempt(
        self,
        problem: str,
        system_prompt: str,
        attempt_index: int,
        stop_event: threading.Event,
        deadline: float,
        temperature_override: float | None = None,
    ) -> dict:

        if stop_event.is_set() or time.time() > deadline:
            return {
                'Attempt':         attempt_index,
                'Prompt':          'skipped',
                'Temperature':     temperature_override,
                'Answer':          None,
                'Entropy':         float('inf'),
                'Response Length': 0,
                'Python Calls':    0,
                'Python Errors':   0,
            }

        local_tool      = None
        sandbox         = None
        python_calls    = 0
        python_errors   = 0
        total_tokens    = 0
        final_answer    = None
        logprobs_buffer = []

        attempt_seed        = int(math.pow(self.cfg.seed + attempt_index, 2))
        attempt_temperature = (
            temperature_override
            if temperature_override is not None
            else self.cfg.attempt_temperatures[0]
        )

        # Identify prompt label for display
        if system_prompt == self.cfg.system_prompt_verify:
            prompt_label = 'verify'
        elif system_prompt == self.cfg.system_prompt_alt:
            prompt_label = 'alt'
        else:
            prompt_label = 'standard'

        try:
            sandbox = self.sandbox_pool.get(timeout=self.cfg.sandbox_timeout)

            local_tool = AIMO3Tool(
                local_jupyter_timeout=self.cfg.jupyter_timeout,
                tool_prompt=self.cfg.tool_prompt,
                sandbox=sandbox
            )

            encoding     = self.encoding
            messages     = self.template.apply_chat_template(
                system_prompt,
                problem,
                local_tool.tool_config
            )
            conversation = Conversation.from_messages(messages)

            for _ in range(self.cfg.turns):
                if stop_event.is_set() or time.time() > deadline:
                    break

                prompt_ids = encoding.render_conversation_for_completion(
                    conversation, Role.ASSISTANT
                )
                max_tokens = self.cfg.context_tokens - len(prompt_ids)

                if max_tokens < self.cfg.buffer_tokens:
                    break

                stream = self.client.completions.create(
                    model=self.cfg.served_model_name,
                    temperature=attempt_temperature,
                    logprobs=self.cfg.top_logprobs,
                    max_tokens=max_tokens,
                    prompt=prompt_ids,
                    seed=attempt_seed,
                    stream=True,
                    extra_body={
                        'min_p':            self.cfg.min_p,
                        'stop_token_ids':   self.stop_token_ids,
                        'return_token_ids': True
                    }
                )

                try:
                    token_buffer = []
                    text_chunks  = []

                    for chunk in stream:
                        if stop_event.is_set() or time.time() > deadline:
                            break

                        new_tokens = chunk.choices[0].token_ids
                        new_text   = chunk.choices[0].text

                        if new_tokens:
                            token_buffer.extend(new_tokens)
                            total_tokens += len(new_tokens)
                            text_chunks.append(new_text)

                            chunk_logprobs = chunk.choices[0].logprobs
                            if chunk_logprobs is not None:
                                if chunk_logprobs.top_logprobs:
                                    logprobs_buffer.extend(
                                        chunk_logprobs.top_logprobs
                                    )

                        if '}' in new_text:
                            search_text = ''.join(
                                text_chunks[-self.cfg.search_tokens:]
                            )
                            answer = self._scan_for_answer(search_text)
                            if answer is not None:
                                final_answer = answer
                                break

                finally:
                    stream.close()

                if final_answer is not None:
                    break

                if not token_buffer:
                    break

                new_messages = encoding.parse_messages_from_completion_tokens(
                    token_buffer, Role.ASSISTANT
                )
                conversation.messages.extend(new_messages)
                last_message = new_messages[-1]

                if last_message.channel == 'final':
                    final_answer = self._scan_for_answer(
                        last_message.content[0].text
                    )
                    break

                if last_message.recipient == 'python':
                    python_calls  += 1
                    tool_responses = local_tool.process_sync_plus(last_message)
                    response_text  = tool_responses[0].content[0].text

                    if (
                        response_text.startswith('[ERROR]')
                        or 'Traceback' in response_text
                        or 'Error:' in response_text
                    ):
                        python_errors += 1

                    conversation.messages.extend(tool_responses)

        except Exception:
            python_errors += 1

        finally:
            if sandbox is not None:
                sandbox.reset()
                self.sandbox_pool.put(sandbox)

        return {
            'Attempt':         attempt_index,
            'Prompt':          prompt_label,
            'Temperature':     attempt_temperature,
            'Answer':          final_answer,
            'Entropy':         self._compute_mean_entropy(logprobs_buffer),
            'Response Length': total_tokens,
            'Python Calls':    python_calls,
            'Python Errors':   python_errors,
        }

    # ── Run one parallel batch of 9 attempts ──────────────────────────

    def _run_batch(
        self,
        user_input: str,
        batch_number: int,
        global_attempt_offset: int,
        stop_event: threading.Event,
        deadline: float,
        detailed_results: list,
        valid_answers: list,
        is_first_batch: bool,
    ) -> None:
        """
        Runs CFG.attempts attempts in parallel.
        Appends to shared detailed_results and valid_answers in place.
        Calls _has_converged after each result to decide early stop.
        """
        print(
            f'\n── Batch {batch_number} '
            f'(attempts {global_attempt_offset}–'
            f'{global_attempt_offset + self.cfg.attempts - 1}) ──'
        )

        executor      = ThreadPoolExecutor(max_workers=self.cfg.workers)
        batch_futures = []

        try:
            for position in range(self.cfg.attempts):
                attempt_index = global_attempt_offset + position
                temp          = self.cfg.attempt_temperatures[
                    position % len(self.cfg.attempt_temperatures)
                ]
                prompt        = self._pick_prompt(position)

                future = executor.submit(
                    self._process_attempt,
                    user_input,
                    prompt,
                    attempt_index,
                    stop_event,
                    deadline,
                    temp,
                )
                batch_futures.append(future)

            for future in as_completed(batch_futures):
                if stop_event.is_set():
                    break
                try:
                    result = future.result()
                    detailed_results.append(result)

                    if result['Answer'] is not None:
                        valid_answers.append(result['Answer'])

                    if self._has_converged(valid_answers, is_first_batch):
                        top_count, second_count, lead = self._compute_lead(valid_answers)
                        print(
                            f'  Early stop — '
                            f'total_valid={len(valid_answers)}, '
                            f'top={top_count}, second={second_count}, lead={lead}'
                        )
                        stop_event.set()
                        break

                except Exception as exc:
                    print(f'  Attempt failed: {exc}')

        finally:
            for f in batch_futures:
                f.cancel()
            executor.shutdown(wait=True, cancel_futures=True)

    # ── Answer selection ──────────────────────────────────────────────

    def _select_answer(self, detailed_results: list) -> int:

        answer_weights = defaultdict(float)
        answer_votes   = defaultdict(int)

        for result in detailed_results:
            answer  = result['Answer']
            entropy = result['Entropy']
            if answer is not None:
                weight = 1.0 / max(entropy, 1e-9)
                answer_weights[answer] += weight
                answer_votes[answer]   += 1

        scored_answers = sorted(
            [
                {'answer': ans, 'votes': answer_votes[ans], 'score': score}
                for ans, score in answer_weights.items()
            ],
            key=lambda x: (x['votes'], x['score']),
            reverse=True
        )

        vote_df = pd.DataFrame(
            [
                (s['answer'], s['votes'], round(s['score'], 3))
                for s in scored_answers
            ],
            columns=['Answer', 'Votes', 'Score']
        )
        display(vote_df)

        if not scored_answers:
            print('\nFinal Answer: 0\n')
            return 0

        final_answer = scored_answers[0]['answer']
        print(f'\nFinal Answer: {final_answer}\n')
        return final_answer

    # ── Display combined results table ────────────────────────────────

    def _display_results(self, detailed_results: list) -> None:
        if not detailed_results:
            return
        results_df = pd.DataFrame(detailed_results)
        results_df['Entropy']     = results_df['Entropy'].round(3)
        results_df['Answer']      = results_df['Answer'].astype('Int64')
        results_df['Temperature'] = results_df['Temperature'].astype(float).round(1)
        cols = [
            'Attempt', 'Prompt', 'Temperature', 'Answer', 'Entropy',
            'Response Length', 'Python Calls', 'Python Errors'
        ]
        cols = [c for c in cols if c in results_df.columns]
        display(results_df[cols])

    # ── Main solve loop ───────────────────────────────────────────────

    def solve_problem(self, problem: str) -> int:

        print(f'\nProblem: {problem}\n')
        user_input = f'{problem} {self.cfg.preference_prompt}'

        # ── Base budget (same formula as original) ────────────────────
        elapsed_global       = time.time() - self.notebook_start_time
        time_left            = self.cfg.notebook_limit - elapsed_global
        problems_left_others = max(0, self.problems_remaining - 1)
        reserved_time        = problems_left_others * self.cfg.base_problem_timeout

        base_budget = time_left - reserved_time
        base_budget = min(base_budget, self.cfg.high_problem_timeout)
        base_budget = max(base_budget, self.cfg.base_problem_timeout)

        # ── Extended budget: 4x base, capped at 4x high_problem_timeout
        extended_budget = min(
            base_budget * self.cfg.budget_multiplier,
            self.cfg.extended_timeout_cap
        )
        deadline = time.time() + extended_budget

        print(
            f'Base budget: {base_budget:.0f}s | '
            f'Extended budget: {extended_budget:.0f}s | '
            f'Deadline: {deadline:.2f}\n'
        )

        # ── Accumulate across all batches ─────────────────────────────
        detailed_results      = []
        valid_answers         = []
        global_attempt_offset = 0
        batch_number          = 1

        while True:

            time_remaining = deadline - time.time()

            if time_remaining < 30:
                print(
                    f'Only {time_remaining:.0f}s left — '
                    f'not enough time for batch {batch_number}, stopping.'
                )
                break

            is_first_batch = (batch_number == 1)
            stop_event     = threading.Event()

            self._run_batch(
                user_input=user_input,
                batch_number=batch_number,
                global_attempt_offset=global_attempt_offset,
                stop_event=stop_event,
                deadline=deadline,
                detailed_results=detailed_results,
                valid_answers=valid_answers,
                is_first_batch=is_first_batch,
            )

            global_attempt_offset += self.cfg.attempts
            batch_number          += 1

            top_count, second_count, lead = self._compute_lead(valid_answers)

            print(
                f'\nAfter batch {batch_number - 1}: '
                f'distribution={Counter(valid_answers).most_common(5)} | '
                f'top={top_count} | second={second_count} | lead={lead} | '
                f'total_valid={len(valid_answers)}'
            )

            # While loop exit uses the later-batch threshold
            # (is_first_batch=False) since we check after the batch completes
            if self._has_converged(valid_answers, is_first_batch=False):
                print(f'Converged after batch {batch_number - 1}. Stopping.')
                break

            time_remaining = deadline - time.time()
            print(
                f'Not yet converged — '
                f'running batch {batch_number} '
                f'({time_remaining:.0f}s remaining)'
            )

        # ── Decrement once, after all batches ─────────────────────────
        self.problems_remaining = max(0, self.problems_remaining - 1)

        # ── Display combined table and select answer ──────────────────
        total_batches = batch_number - 1
        print(f'\n{"─" * 60}')
        print(
            f'Total: {len(detailed_results)} attempts '
            f'across {total_batches} batch(es)'
        )
        print(f'{"─" * 60}')

        self._display_results(detailed_results)

        if not valid_answers:
            print('\nResult: 0\n')
            return 0

        return self._select_answer(detailed_results)

    def __del__(self):
        if hasattr(self, 'server_process'):
            self.server_process.terminate()
            self.server_process.wait()
        if hasattr(self, 'log_file'):
            self.log_file.close()
        if hasattr(self, 'sandbox_pool'):
            while not self.sandbox_pool.empty():
                try:
                    sb = self.sandbox_pool.get_nowait()
                    sb.close()
                except Exception:
                    pass

solver = AIMO3Solver(CFG)

Loading model weights from /kaggle/input/gpt-oss-120b/transformers/default/1 into OS Page Cache...
Processed 26 files (65.28 GB) in 72.94 seconds.

Waiting for vLLM server...
Server is ready (took 126.08 seconds).

Initializing 16 persistent Jupyter kernels...
Kernels initialized in 2.89 seconds.

def predict(id_: pl.DataFrame, question: pl.DataFrame, answer: Optional[pl.DataFrame] = None) -> pl.DataFrame:
    global correct_count, total_count, predictions
    
    id_value = id_.item(0)
    question_text = question.item(0)

    gc.disable()

    print("------")
    print(f"ID: {id_value}")
    print(f"Question: {question_text[:200]}...")

    final_answer = solver.solve_problem(question_text)
    predictions[id_value] = final_answer

    gc.enable()
    gc.collect()

    total_count += 1
    if id_value in ground_truth:
        gt = ground_truth[id_value]
        is_correct = (final_answer == gt)
        if is_correct:
            correct_count += 1
        status = "✅" if is_correct else "❌"
        print(f"Answer: {final_answer} | Ground Truth: {gt} | {status}")
        print(f"📊 Running Accuracy: {correct_count}/{total_count} ({100*correct_count/total_count:.1f}%)")
    else:
        print(f"Answer: {final_answer}")

    print("------\n")

    return pl.DataFrame({'id': id_value, 'answer': final_answer})



# Load reference data and keep ground truth for accuracy calculation
df = pd.read_csv(
    "/kaggle/input/ai-mathematical-olympiad-progress-prize-3/reference.csv"
)

# Store ground truth answers for accuracy calculation (only in local mode)
ground_truth = dict(zip(df["id"], df["answer"])) if "answer" in df.columns else {}

# Drop id and answer columns
df = df.drop(columns=["answer"], errors="ignore")

# Create input file without answers
df.to_csv("ref.csv", index=False)

# Track predictions for accuracy calculation
predictions = {}
correct_count = 0
total_count = 0


# df = df[df['id'].isin(['dd7f5e', '86e8e5'])]
# df = df[df['id'].isin(['86e8e5'])]
# df = pd.concat([df]*10, ignore_index=True)
df.to_csv("ref.csv", index=False)

print(df)

       id                                            problem
0  0e644e  Let $ABC$ be an acute-angled triangle with int...
1  26de63  Define a function $f \colon \mathbb{Z}_{\geq 1...
2  424e18  A tournament is held with $2^{20}$ runners eac...
3  42d360  On a blackboard, Ken starts off by writing a p...
4  641659  Let $ABC$ be a triangle with $AB \neq AC$, cir...
5  86e8e5  Let $n \geq 6$ be a positive integer. We call ...
6  92ba6a  Alice and Bob are each holding some integer nu...
7  9c1c5f  Let $f \colon \mathbb{Z}_{\geq 1} \to \mathbb{...
8  a295e9  A $500 \times 500$ square is divided into $k$ ...
9  dd7f5e  Let $\mathcal{F}$ be the set of functions $\al...

inference_server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
    
else:
    inference_server.run_local_gateway(
        # ('/kaggle/input/ai-mathematical-olympiad-progress-prize-3/test.csv',)
        ('/kaggle/working/ref.csv',)
    )

------
ID: 9c1c5f
Question: Let $f \colon \mathbb{Z}_{\geq 1} \to \mathbb{Z}_{\geq 1}$ be a function such that for all positive integers $m$ and $n$, 
\begin{equation*}
    f(m) + f(n) = f(m + n + mn).
\end{equation*}
Across all...

Problem: Let $f \colon \mathbb{Z}_{\geq 1} \to \mathbb{Z}_{\geq 1}$ be a function such that for all positive integers $m$ and $n$, 
\begin{equation*}
    f(m) + f(n) = f(m + n + mn).
\end{equation*}
Across all functions $f$ such that $f(n) \leq 1000$ for all $n \leq 1000$, how many different values can $f(2024)$ take?

Base budget: 900s | Extended budget: 2700s | Deadline: 1774625340.03


── Batch 1 (attempts 0–8) ──
  Early stop — total_valid=6, top=6, second=0, lead=6

After batch 1: distribution=[(580, 6)] | top=6 | second=0 | lead=6 | total_valid=6
Converged after batch 1. Stopping.

────────────────────────────────────────────────────────────
Total: 6 attempts across 1 batch(es)
────────────────────────────────────────────────────────────

Final Answer: 580

Answer: 580 | Ground Truth: 580 | ✅
📊 Running Accuracy: 1/1 (100.0%)
------

------
ID: 86e8e5
Question: Let $n \geq 6$ be a positive integer. We call a positive integer $n$-Norwegian if it has three distinct positive divisors whose sum is equal to $n$. Let $f(n)$ denote the smallest $n$-Norwegian positi...

Problem: Let $n \geq 6$ be a positive integer. We call a positive integer $n$-Norwegian if it has three distinct positive divisors whose sum is equal to $n$. Let $f(n)$ denote the smallest $n$-Norwegian positive integer. Let $M=3^{2025!}$ and for a non-negative integer $c$ define 
\begin{equation*}
    g(c)=\frac{1}{2025!}\left\lfloor \frac{2025! f(M+c)}{M}\right\rfloor.
\end{equation*}
We can write 
\begin{equation*}
    g(0)+g(4M)+g(1848374)+g(10162574)+g(265710644)+g(44636594)=\frac{p}{q}
\end{equation*}
where $p$ and $q$ are coprime positive integers. What is the remainder when $p+q$ is divided by $99991$?

Base budget: 900s | Extended budget: 2700s | Deadline: 1774625488.49


── Batch 1 (attempts 0–8) ──

After batch 1: distribution=[(51379, 1), (23, 1), (40614, 1), (85452, 1), (87208, 1)] | top=1 | second=1 | lead=0 | total_valid=7
Not yet converged — running batch 2 (2081s remaining)

── Batch 2 (attempts 9–17) ──

After batch 2: distribution=[(54680, 2), (96985, 2), (8687, 2), (51379, 1), (23, 1)] | top=2 | second=2 | lead=0 | total_valid=15
Not yet converged — running batch 3 (1499s remaining)

── Batch 3 (attempts 18–26) ──

After batch 3: distribution=[(23, 2), (54680, 2), (96985, 2), (8687, 2), (6825, 2)] | top=2 | second=2 | lead=0 | total_valid=21
Not yet converged — running batch 4 (737s remaining)

── Batch 4 (attempts 27–35) ──

After batch 4: distribution=[(8687, 4), (98449, 3), (23, 2), (54680, 2), (96985, 2)] | top=4 | second=3 | lead=1 | total_valid=27
Not yet converged — running batch 5 (-0s remaining)
Only -0s left — not enough time for batch 5, stopping.

────────────────────────────────────────────────────────────
Total: 36 attempts across 4 batch(es)
────────────────────────────────────────────────────────────

Final Answer: 8687

Answer: 8687 | Ground Truth: 8687 | ✅
📊 Running Accuracy: 2/2 (100.0%)
------

------
ID: 42d360
Question: On a blackboard, Ken starts off by writing a positive integer $n$ and then applies the following move until he first reaches $1$. Given that the number on the board is $m$, he chooses a base $b$, wher...

Problem: On a blackboard, Ken starts off by writing a positive integer $n$ and then applies the following move until he first reaches $1$. Given that the number on the board is $m$, he chooses a base $b$, where $2 \leq b \leq m$, and considers the unique base-$b$ representation of $m$,
\begin{equation*}
    m = \sum_{k = 0}^\infty a_k \cdot b^k
\end{equation*}
where $a_k$ are non-negative integers and $0 \leq a_k < b$ for each $k$. Ken then erases $m$ on the blackboard and replaces it with $\sum\limits_{k = 0}^\infty a_k$.

Across all choices of $1 \leq n \leq 10^{10^5}$, the largest possible number of moves Ken could make is $M$. What is the remainder when $M$ is divided by $10^{5}$?

Base budget: 450s | Extended budget: 1349s | Deadline: 1774626838.48


── Batch 1 (attempts 0–8) ──
  Early stop — total_valid=6, top=6, second=0, lead=6

After batch 1: distribution=[(32193, 6)] | top=6 | second=0 | lead=6 | total_valid=6
Converged after batch 1. Stopping.

────────────────────────────────────────────────────────────
Total: 6 attempts across 1 batch(es)
────────────────────────────────────────────────────────────

Final Answer: 32193

Answer: 32193 | Ground Truth: 32193 | ✅
📊 Running Accuracy: 3/3 (100.0%)
------

------
ID: 26de63
Question: Define a function $f \colon \mathbb{Z}_{\geq 1} \to \mathbb{Z}_{\geq 1}$ by
\begin{equation*}
    f(n) = \sum_{i = 1}^n \sum_{j = 1}^n j^{1024} \left\lfloor\frac1j + \frac{n-i}{n}\right\rfloor.
\end{e...

Problem: Define a function $f \colon \mathbb{Z}_{\geq 1} \to \mathbb{Z}_{\geq 1}$ by
\begin{equation*}
    f(n) = \sum_{i = 1}^n \sum_{j = 1}^n j^{1024} \left\lfloor\frac1j + \frac{n-i}{n}\right\rfloor.
\end{equation*}
Let $M=2 \cdot 3 \cdot 5 \cdot 7 \cdot 11 \cdot 13$ and let $N = f{\left(M^{15}\right)} - f{\left(M^{15}-1\right)}$. Let $k$ be the largest non-negative integer such that $2^k$ divides $N$. What is the remainder when $2^k$ is divided by $5^7$?

Base budget: 639s | Extended budget: 1916s | Deadline: 1774627516.52


── Batch 1 (attempts 0–8) ──
  Early stop — total_valid=6, top=6, second=0, lead=6

After batch 1: distribution=[(32951, 6)] | top=6 | second=0 | lead=6 | total_valid=6
Converged after batch 1. Stopping.

────────────────────────────────────────────────────────────
Total: 6 attempts across 1 batch(es)
────────────────────────────────────────────────────────────

Final Answer: 32951

Answer: 32951 | Ground Truth: 32951 | ✅
📊 Running Accuracy: 4/4 (100.0%)
------

------
ID: dd7f5e
Question: Let $\mathcal{F}$ be the set of functions $\alpha \colon \mathbb{Z}\to \mathbb{Z}$ for which there are only finitely many $n \in \mathbb{Z}$ such that $\alpha(n) \neq 0$. 

For two functions $\alpha$ ...

Problem: Let $\mathcal{F}$ be the set of functions $\alpha \colon \mathbb{Z}\to \mathbb{Z}$ for which there are only finitely many $n \in \mathbb{Z}$ such that $\alpha(n) \neq 0$. 

For two functions $\alpha$ and $\beta$ in $\mathcal{F}$, define their product $\alpha\star\beta$ to be $\sum\limits_{n\in\mathbb{Z}} \alpha(n)\cdot \beta(n)$. Also, for $n\in\mathbb{Z}$, define a shift operator $S_n \colon \mathcal{F}\to \mathcal{F}$ by $S_n(\alpha)(t)=\alpha(t+n)$ for all $t \in \mathbb{Z}$.

A function $\alpha \in \mathcal{F}$ is called \emph{shifty} if 
\begin{itemize}
    \item $\alpha(m)=0$ for all integers $m<0$ and $m>8$ and
    \item There exists $\beta \in \mathcal{F}$ and integers $k \neq l$ such that for all $n \in \mathbb{Z}$
    \begin{equation*}
        S_n(\alpha)\star\beta =
        \begin{cases}
            1 & n \in \{k,l\} \\
            0 & n \not \in \{k,l\}
        \end{cases}
        \; .
    \end{equation*}
\end{itemize}
How many shifty functions are there in $\mathcal{F}$?

Base budget: 841s | Extended budget: 2522s | Deadline: 1774628220.51


── Batch 1 (attempts 0–8) ──
  Early stop — total_valid=8, top=5, second=2, lead=3

After batch 1: distribution=[(160, 5), (266, 2), (44, 1)] | top=5 | second=2 | lead=3 | total_valid=8
Converged after batch 1. Stopping.

────────────────────────────────────────────────────────────
Total: 8 attempts across 1 batch(es)
────────────────────────────────────────────────────────────

Final Answer: 160

Answer: 160 | Ground Truth: 160 | ✅
📊 Running Accuracy: 5/5 (100.0%)
------

------
ID: 92ba6a
Question: Alice and Bob are each holding some integer number of sweets. Alice says to Bob: ``If we each added the number of sweets we're holding to our (positive integer) age, my answer would be double yours. I...

Problem: Alice and Bob are each holding some integer number of sweets. Alice says to Bob: ``If we each added the number of sweets we're holding to our (positive integer) age, my answer would be double yours. If we took the product, then my answer would be four times yours.'' Bob replies: ``Why don't you give me five of your sweets because then both our sum and product would be equal.'' What is the product of Alice and Bob's ages?

Base budget: 864s | Extended budget: 2591s | Deadline: 1774628566.92


── Batch 1 (attempts 0–8) ──
  Early stop — total_valid=6, top=6, second=0, lead=6

After batch 1: distribution=[(50, 6)] | top=6 | second=0 | lead=6 | total_valid=6
Converged after batch 1. Stopping.

────────────────────────────────────────────────────────────
Total: 6 attempts across 1 batch(es)
────────────────────────────────────────────────────────────

Final Answer: 50

Answer: 50 | Ground Truth: 50 | ✅
📊 Running Accuracy: 6/6 (100.0%)
------

------
ID: 424e18
Question: A tournament is held with $2^{20}$ runners each of which has a different running speed. In each race, two runners compete against each other with the faster runner always winning the race. The competi...

Problem: A tournament is held with $2^{20}$ runners each of which has a different running speed. In each race, two runners compete against each other with the faster runner always winning the race. The competition consists of $20$ rounds with each runner starting with a score of $0$. In each round, the runners are paired in such a way that in each pair, both runners have the same score at the beginning of the round. The winner of each race in the $i^{\text{th}}$ round receives $2^{20-i}$ points and the loser gets no points.

At the end of the tournament, we rank the competitors according to their scores. Let $N$ denote the number of possible orderings of the competitors at the end of the tournament. Let $k$ be the largest positive integer such that $10^k$ divides $N$. What is the remainder when $k$ is divided by $10^{5}$?

Base budget: 900s | Extended budget: 2700s | Deadline: 1774628714.22


── Batch 1 (attempts 0–8) ──

After batch 1: distribution=[(21818, 5), (62140, 4)] | top=5 | second=4 | lead=1 | total_valid=9
Not yet converged — running batch 2 (2415s remaining)

── Batch 2 (attempts 9–17) ──
  Early stop — total_valid=10, top=6, second=4, lead=2

After batch 2: distribution=[(21818, 6), (62140, 4)] | top=6 | second=4 | lead=2 | total_valid=10
Converged after batch 2. Stopping.

────────────────────────────────────────────────────────────
Total: 10 attempts across 2 batch(es)
────────────────────────────────────────────────────────────

Final Answer: 21818

Answer: 21818 | Ground Truth: 21818 | ✅
📊 Running Accuracy: 7/7 (100.0%)
------

------
ID: 0e644e
Question: Let $ABC$ be an acute-angled triangle with integer side lengths and $AB<AC$. Points $D$ and $E$ lie on segments $BC$ and $AC$, respectively, such that $AD=AE=AB$. Line $DE$ intersects $AB$ at $X$. Cir...

Problem: Let $ABC$ be an acute-angled triangle with integer side lengths and $AB<AC$. Points $D$ and $E$ lie on segments $BC$ and $AC$, respectively, such that $AD=AE=AB$. Line $DE$ intersects $AB$ at $X$. Circles $BXD$ and $CED$ intersect for the second time at $Y \neq D$. Suppose that $Y$ lies on line $AD$. There is a unique such triangle with minimal perimeter. This triangle has side lengths $a=BC$, $b=CA$, and $c=AB$. Find the remainder when $abc$ is divided by $10^{5}$.

Base budget: 900s | Extended budget: 2700s | Deadline: 1774629153.98


── Batch 1 (attempts 0–8) ──
  Early stop — total_valid=6, top=6, second=0, lead=6

After batch 1: distribution=[(336, 6)] | top=6 | second=0 | lead=6 | total_valid=6
Converged after batch 1. Stopping.

────────────────────────────────────────────────────────────
Total: 6 attempts across 1 batch(es)
────────────────────────────────────────────────────────────

Final Answer: 336

Answer: 336 | Ground Truth: 336 | ✅
📊 Running Accuracy: 8/8 (100.0%)
------

------
ID: a295e9
Question: A $500 \times 500$ square is divided into $k$ rectangles, each having integer side lengths. Given that no two of these rectangles have the same perimeter, the largest possible value of $k$ is $\mathca...

Problem: A $500 \times 500$ square is divided into $k$ rectangles, each having integer side lengths. Given that no two of these rectangles have the same perimeter, the largest possible value of $k$ is $\mathcal{K}$. What is the remainder when $k$ is divided by $10^{5}$?

Base budget: 900s | Extended budget: 2700s | Deadline: 1774629407.88


── Batch 1 (attempts 0–8) ──
  Early stop — total_valid=6, top=5, second=1, lead=4

After batch 1: distribution=[(520, 5), (186, 1)] | top=5 | second=1 | lead=4 | total_valid=6
Converged after batch 1. Stopping.

────────────────────────────────────────────────────────────
Total: 6 attempts across 1 batch(es)
────────────────────────────────────────────────────────────

Final Answer: 520

Answer: 520 | Ground Truth: 520 | ✅
📊 Running Accuracy: 9/9 (100.0%)
------

------
ID: 641659
Question: Let $ABC$ be a triangle with $AB \neq AC$, circumcircle $\Omega$, and incircle $\omega$. Let the contact points of $\omega$ with $BC$, $CA$, and $AB$ be $D$, $E$, and $F$, respectively. Let the circum...

Problem: Let $ABC$ be a triangle with $AB \neq AC$, circumcircle $\Omega$, and incircle $\omega$. Let the contact points of $\omega$ with $BC$, $CA$, and $AB$ be $D$, $E$, and $F$, respectively. Let the circumcircle of $AFE$ meet $\Omega$ at $K$ and let the reflection of $K$ in $EF$ be $K'$. Let $N$ denote the foot of the perpendicular from $D$ to $EF$. The circle tangent to line $BN$ and passing through $B$ and $K$ intersects $BC$ again at $T \neq B$. 
    
Let sequence $(F_n)_{n \geq 0}$ be defined by $F_0 = 0$, $F_1 = 1$ and for $n \geq 2$, $F_n = F_{n-1} + F_{n-2}$. Call $ABC$ $n$\emph{-tastic} if $BD = F_n$, $CD = F_{n+1}$, and $KNK'B$ is cyclic. Across all $n$-tastic triangles, let $a_n$ denote the maximum possible value of $\frac{CT \cdot NB}{BT \cdot NE}$. Let $\alpha$ denote the smallest real number such that for all sufficiently large $n$, $a_{2n} < \alpha$. Given that $\alpha = p + \sqrt{q}$ for rationals $p$ and $q$, what is the remainder when $\left\lfloor p^{q^p} \right\rfloor$ is divided by $99991$?

Base budget: 900s | Extended budget: 2700s | Deadline: 1774629674.61


── Batch 1 (attempts 0–8) ──
  Early stop — total_valid=6, top=5, second=1, lead=4

After batch 1: distribution=[(57447, 5), (1, 1)] | top=5 | second=1 | lead=4 | total_valid=6
Converged after batch 1. Stopping.

────────────────────────────────────────────────────────────
Total: 6 attempts across 1 batch(es)
────────────────────────────────────────────────────────────

	Answer	Votes	Score
0	8687	4	6.534
1	98449	3	4.872
2	54680	2	3.559
3	6825	2	3.250
4	96985	2	3.219
5	23	2	2.895
6	82321	1	1.780
7	85452	1	1.756
8	13657	1	1.754
9	64836	1	1.730
10	1233	1	1.718
11	92200	1	1.661
12	1035	1	1.626
13	40958	1	1.622
14	51379	1	1.614
15	40614	1	1.599
16	77463	1	1.573
17	87208	1	1.572

	Attempt	Prompt	Temperature	Answer	Entropy	Response Length	Python Calls	Python Errors
0	3	verify	0.2	580	0.594	7046	5	1
1	5	verify	0.6	580	0.587	7056	6	2
2	4	verify	0.6	580	0.738	8201	11	1
3	2	standard	0.2	580	0.652	9057	7	1
4	8	alt	0.6	580	0.743	10728	12	0
5	6	alt	0.6	580	0.739	11047	11	1

	Attempt	Prompt	Temperature	Answer	Entropy	Response Length	Python Calls	Python Errors
0	0	standard	0.2	51379	0.620	19516	19	3
1	8	alt	0.6	23	0.686	30998	21	1
2	6	alt	0.6	40614	0.625	28399	37	8
3	3	verify	0.2	85452	0.570	28575	59	7
4	7	alt	0.6	<NA>	0.648	32779	35	3
5	1	standard	0.2	87208	0.636	34867	32	1
6	5	verify	0.6	<NA>	0.643	48952	63	4
7	2	standard	0.2	54680	0.537	45404	86	7
8	4	verify	0.6	1035	0.615	54424	47	2
9	16	alt	0.6	96985	0.660	23482	16	0
10	15	alt	0.6	13657	0.570	26359	39	4
11	9	standard	0.2	77463	0.636	33965	48	2
12	11	standard	0.2	8687	0.612	33023	34	5
13	13	verify	0.6	8687	0.631	34817	51	2
14	14	verify	0.6	<NA>	0.641	39611	30	3
15	12	verify	0.2	54680	0.589	37003	39	7
16	10	standard	0.2	96985	0.587	41373	56	4
17	17	alt	0.6	1233	0.582	45585	48	4
18	23	verify	0.6	6825	0.647	28463	37	2
19	22	verify	0.6	23	0.696	32131	16	1
20	26	alt	0.6	6825	0.587	33489	45	1
21	25	alt	0.6	92200	0.602	46552	62	5
22	20	standard	0.2	40958	0.617	39302	77	25
23	21	verify	0.2	64836	0.578	47921	88	8
24	19	standard	0.2	<NA>	0.560	48494	53	3
25	24	alt	0.6	<NA>	0.638	50776	115	2
26	18	standard	0.2	<NA>	0.565	59170	47	11
27	33	alt	0.6	98449	0.654	40707	42	4
28	27	standard	0.2	98449	0.605	46205	52	3
29	32	verify	0.6	98449	0.592	42522	68	15
30	34	alt	0.6	8687	0.625	46821	80	6
31	29	standard	0.2	8687	0.583	46392	61	7
32	28	standard	0.2	82321	0.562	48037	72	7
33	31	verify	0.6	<NA>	0.630	52173	38	3
34	35	alt	0.6	<NA>	0.599	52635	46	4
35	30	verify	0.2	<NA>	0.602	47787	78	4

	Attempt	Prompt	Temperature	Answer	Entropy	Response Length	Python Calls	Python Errors
0	4	verify	0.6	32193	0.763	5379	4	0
1	3	verify	0.2	32193	0.616	6170	8	1
2	1	standard	0.2	32193	0.667	6225	5	0
3	6	alt	0.6	32193	0.661	7260	14	0
4	0	standard	0.2	32193	0.592	8507	9	0
5	8	alt	0.6	32193	0.665	9557	16	3

	Attempt	Prompt	Temperature	Answer	Entropy	Response Length	Python Calls	Python Errors
0	3	verify	0.2	32951	0.511	4741	2	1
1	0	standard	0.2	32951	0.552	5145	5	0
2	1	standard	0.2	32951	0.548	5695	7	2
3	2	standard	0.2	32951	0.481	7076	4	1
4	5	verify	0.6	32951	0.523	8004	6	0
5	6	alt	0.6	32951	0.519	7822	14	2

	Attempt	Prompt	Temperature	Answer	Entropy	Response Length	Python Calls	Python Errors
0	2	standard	0.2	160	0.738	13524	15	1
1	7	alt	0.6	160	0.769	15564	8	1
2	5	verify	0.6	44	0.684	18182	28	2
3	8	alt	0.6	160	0.678	18946	16	1
4	1	standard	0.2	266	0.672	19320	22	5
5	3	verify	0.2	266	0.573	18345	18	7
6	6	alt	0.6	160	0.679	20395	30	6
7	0	standard	0.2	160	0.658	22441	33	1

	Attempt	Prompt	Temperature	Answer	Entropy	Response Length	Python Calls
0	8	alt	0.6	50	0.495	1772	2
1	2	standard	0.2	50	0.554	1835	1
2	0	standard	0.2	50	0.444	2646	1
3	5	verify	0.6	50	0.406	2642	2
4	7	alt	0.6	50	0.655	2910	1
5	4	verify	0.6	50	0.521	3092	2

	Attempt	Prompt	Temperature	Answer	Entropy	Response Length	Python Calls	Python Errors
0	5	verify	0.6	62140	0.842	7942	3	0
1	8	alt	0.6	62140	0.901	8546	3	0
2	4	verify	0.6	21818	0.768	10854	7	0
3	1	standard	0.2	21818	0.830	12850	5	0
4	0	standard	0.2	21818	0.700	13887	18	1
5	2	standard	0.2	62140	0.843	16277	4	1
6	7	alt	0.6	21818	0.786	16722	11	0
7	3	verify	0.2	62140	0.732	23759	25	0
8	6	alt	0.6	21818	0.686	30257	27	2
9	11	standard	0.2	21818	0.716	11066	9	1

	Attempt	Prompt	Temperature	Answer	Entropy	Response Length	Python Calls	Python Errors
0	6	alt	0.6	336	0.569	11819	18	2
1	2	standard	0.2	336	0.606	13297	12	1
2	7	alt	0.6	336	0.513	12986	23	2
3	8	alt	0.6	336	0.516	14895	17	1
4	4	verify	0.6	336	0.551	15454	15	3
5	5	verify	0.6	336	0.503	21184	25	6

	Attempt	Prompt	Temperature	Answer	Entropy	Response Length	Python Calls
0	8	alt	0.6	520	0.912	10020	6
1	2	standard	0.2	520	0.901	12290	5
2	3	verify	0.2	520	0.820	15301	8
3	0	standard	0.2	186	0.779	17588	3
4	4	verify	0.6	520	0.882	20225	15
5	1	standard	0.2	520	0.859	23439	9

	Attempt	Prompt	Temperature	Answer	Entropy	Response Length	Python Calls	Python Errors
0	5	verify	0.6	57447	0.486	13115	6	1
1	3	verify	0.2	57447	0.325	17483	32	9
2	0	standard	0.2	57447	0.501	22950	45	5
3	4	verify	0.6	57447	0.412	24048	32	5
4	7	alt	0.6	57447	0.479	23111	17	6
5	2	standard	0.2	1	0.537	25198	54	6