diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index e473110eca7415..5d1f44988a6df1 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -144,6 +144,7 @@ typedef struct _optimization_stats { uint64_t unknown_callee; uint64_t trace_immediately_deopts; uint64_t executors_invalidated; + uint64_t fitness_terminated_traces; UOpStats opcode[PYSTATS_MAX_UOP_ID + 1]; uint64_t unsupported_opcode[256]; uint64_t trace_length_hist[_Py_UOP_HIST_SIZE]; diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index c4b084642668a9..dec587563f0eb5 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -449,6 +449,9 @@ typedef struct _PyOptimizationConfig { uint16_t side_exit_initial_value; uint16_t side_exit_initial_backoff; + // Initial trace fitness budget (PYTHON_JIT_FITNESS_INITIAL) + uint16_t fitness_initial; + // Optimization flags bool specialization_enabled; bool uops_optimize_enabled; diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index cf01c620476ff7..f3b77ce000a35f 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -15,6 +15,51 @@ extern "C" { #include "pycore_optimizer_types.h" #include <stdbool.h> +/* Fitness controls how long a trace can grow. + * Starts at FITNESS_INITIAL, then decreases from per-bytecode buffer usage + * plus branch/frame heuristics. The trace stops when fitness drops below the + * current exit_quality. + * + * Design targets for the constants below: + * 1. Reaching the abstract frame-depth limit should drop fitness below + * EXIT_QUALITY_SPECIALIZABLE. + * 2. A backward edge should leave budget for roughly N_BACKWARD_SLACK more + * bytecodes, assuming AVG_SLOTS_PER_INSTRUCTION. + * 3. Roughly seven balanced branches should reduce fitness to + * EXIT_QUALITY_DEFAULT before per-slot costs. + * 4. 
A push followed by a matching return is net-zero on frame-specific + * fitness, excluding per-slot costs. + */ +#define MAX_TARGET_LENGTH 400 +#define OPTIMIZER_EFFECTIVENESS 2 +#define FITNESS_INITIAL (MAX_TARGET_LENGTH * OPTIMIZER_EFFECTIVENESS) + +/* Exit quality thresholds: trace stops when fitness < exit_quality. + * Higher = trace is more willing to stop here. */ +#define EXIT_QUALITY_CLOSE_LOOP (FITNESS_INITIAL / 2) +#define EXIT_QUALITY_ENTER_EXECUTOR (FITNESS_INITIAL * 3 / 8) +#define EXIT_QUALITY_DEFAULT (FITNESS_INITIAL / 8) +#define EXIT_QUALITY_SPECIALIZABLE (FITNESS_INITIAL / 80) + +/* Estimated buffer slots per bytecode, used only to derive heuristics. + * Runtime charging uses trace-buffer capacity consumed for each bytecode. */ +#define AVG_SLOTS_PER_INSTRUCTION 6 + +/* Heuristic backward-edge penalty: leave room for about + * N_BACKWARD_SLACK more bytecodes before reaching EXIT_QUALITY_CLOSE_LOOP, + * based on AVG_SLOTS_PER_INSTRUCTION. */ +#define N_BACKWARD_SLACK 50 +#define FITNESS_BACKWARD_EDGE (FITNESS_INITIAL - EXIT_QUALITY_CLOSE_LOOP \ + - N_BACKWARD_SLACK * AVG_SLOTS_PER_INSTRUCTION) + +/* Backward edge penalty for JUMP_BACKWARD_NO_INTERRUPT (coroutines/yield-from). + * Smaller than FITNESS_BACKWARD_EDGE since these loops are very short. */ +#define FITNESS_BACKWARD_EDGE_COROUTINE (FITNESS_BACKWARD_EDGE / 4) + +/* Penalty for a perfectly balanced (50/50) branch. + * 7 such branches (ignoring per-slot cost) exhaust fitness to EXIT_QUALITY_DEFAULT. 
*/ +#define FITNESS_BRANCH_BALANCED ((FITNESS_INITIAL - EXIT_QUALITY_DEFAULT) / 7) + typedef struct _PyJitUopBuffer { _PyUOpInstruction *start; @@ -101,7 +146,8 @@ typedef struct _PyJitTracerPreviousState { } _PyJitTracerPreviousState; typedef struct _PyJitTracerTranslatorState { - int jump_backward_seen; + int32_t fitness; // Current trace fitness, starts high, decrements + int frame_depth; // Current inline depth (0 = root frame) } _PyJitTracerTranslatorState; typedef struct _PyJitTracerState { diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 8acc5eedad58f8..4fc83383e57a15 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -1358,9 +1358,13 @@ def testfunc(n): for _ in gen(n): pass testfunc(TIER2_THRESHOLD * 2) + # The generator may be inlined into testfunc's trace, + # so check whichever executor contains _YIELD_VALUE. gen_ex = get_first_executor(gen) - self.assertIsNotNone(gen_ex) - uops = get_opnames(gen_ex) + testfunc_ex = get_first_executor(testfunc) + ex = gen_ex or testfunc_ex + self.assertIsNotNone(ex) + uops = get_opnames(ex) self.assertNotIn("_MAKE_HEAP_SAFE", uops) self.assertIn("_YIELD_VALUE", uops) diff --git a/Modules/_testinternalcapi/test_cases.c.h b/Modules/_testinternalcapi/test_cases.c.h index c5e16ce373037a..3b2469bac8ba57 100644 --- a/Modules/_testinternalcapi/test_cases.c.h +++ b/Modules/_testinternalcapi/test_cases.c.h @@ -12239,7 +12239,13 @@ tracer->prev_state.instr_frame = frame; tracer->prev_state.instr_oparg = oparg; tracer->prev_state.instr_stacklevel = PyStackRef_IsNone(frame->f_executable) ? 2 : STACK_LEVEL(); - if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]) { + if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + // Branch opcodes use the cache for branch history, not + // specialization counters. Don't reset it. 
+ && opcode != POP_JUMP_IF_FALSE + && opcode != POP_JUMP_IF_TRUE + && opcode != POP_JUMP_IF_NONE + && opcode != POP_JUMP_IF_NOT_NONE) { (&next_instr[1])->counter = trigger_backoff_counter(); } uint8_t record_func_index = _PyOpcode_RecordFunctionIndices[opcode]; diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 2c10e265590ab2..4d6f85060779f1 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -6397,7 +6397,13 @@ dummy_func( tracer->prev_state.instr_frame = frame; tracer->prev_state.instr_oparg = oparg; tracer->prev_state.instr_stacklevel = PyStackRef_IsNone(frame->f_executable) ? 2 : STACK_LEVEL(); - if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]) { + if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + // Branch opcodes use the cache for branch history, not + // specialization counters. Don't reset it. + && opcode != POP_JUMP_IF_FALSE + && opcode != POP_JUMP_IF_TRUE + && opcode != POP_JUMP_IF_NONE + && opcode != POP_JUMP_IF_NOT_NONE) { (&next_instr[1])->counter = trigger_backoff_counter(); } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 59f74439fe202c..590c0fcf3da932 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -12236,7 +12236,13 @@ tracer->prev_state.instr_frame = frame; tracer->prev_state.instr_oparg = oparg; tracer->prev_state.instr_stacklevel = PyStackRef_IsNone(frame->f_executable) ? 2 : STACK_LEVEL(); - if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]) { + if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + // Branch opcodes use the cache for branch history, not + // specialization counters. Don't reset it. 
+ && opcode != POP_JUMP_IF_FALSE + && opcode != POP_JUMP_IF_TRUE + && opcode != POP_JUMP_IF_NONE + && opcode != POP_JUMP_IF_NOT_NONE) { (&next_instr[1])->counter = trigger_backoff_counter(); } uint8_t record_func_index = _PyOpcode_RecordFunctionIndices[opcode]; diff --git a/Python/optimizer.c b/Python/optimizer.c index f09bf778587b12..0f92cc63b72a04 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -549,8 +549,6 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = { }; -#define CONFIDENCE_RANGE 1000 -#define CONFIDENCE_CUTOFF 333 #ifdef Py_DEBUG #define DPRINTF(level, ...) \ @@ -598,6 +596,43 @@ add_to_trace( ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive))) +/* Branch penalty: 0 if the history fully agrees with the traced direction, + * FITNESS_BRANCH_BALANCED if 50/50, twice that if it fully disagrees. */ +static inline int +compute_branch_penalty(uint16_t history, bool branch_taken) +{ + int taken_count = _Py_popcount32((uint32_t)history); + int on_trace_count = branch_taken ? taken_count : 16 - taken_count; + int off_trace = 16 - on_trace_count; + return off_trace * FITNESS_BRANCH_BALANCED / 8; +} + +/* Compute exit quality for the current trace position. + * Higher values mean better places to stop the trace. */ +static inline int32_t +compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode, + const _PyJitTracerState *tracer) +{ + if (target_instr == tracer->initial_state.start_instr || + target_instr == tracer->initial_state.close_loop_instr) { + return EXIT_QUALITY_CLOSE_LOOP; + } + if (target_instr->op.code == ENTER_EXECUTOR) { + return EXIT_QUALITY_ENTER_EXECUTOR; + } + if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) { + return EXIT_QUALITY_SPECIALIZABLE; + } + return EXIT_QUALITY_DEFAULT; +} + +/* Frame penalty: (MAX_ABSTRACT_FRAME_DEPTH-1) pushes exhaust fitness. 
*/ +static inline int32_t +compute_frame_penalty(uint16_t fitness_initial) +{ + return (int32_t)fitness_initial / (MAX_ABSTRACT_FRAME_DEPTH - 1) + 1; +} + static int is_terminator(const _PyUOpInstruction *uop) { @@ -734,13 +769,11 @@ _PyJit_translate_single_bytecode_to_trace( DPRINTF(2, "Unsupported: oparg too large\n"); unsupported: { - // Rewind to previous instruction and replace with _EXIT_TRACE. _PyUOpInstruction *curr = uop_buffer_last(trace); while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) { trace->next--; curr = uop_buffer_last(trace); } - assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2); if (curr->opcode == _SET_IP) { int32_t old_target = (int32_t)uop_get_target(curr); curr->opcode = _DEOPT; @@ -763,6 +796,26 @@ _PyJit_translate_single_bytecode_to_trace( return 1; } + // Stop the trace if fitness has dropped below the exit quality threshold. + _PyJitTracerTranslatorState *ts = &tracer->translator_state; + int32_t eq = compute_exit_quality(target_instr, opcode, tracer); + DPRINTF(3, "Fitness check: %s(%d) fitness=%d, exit_quality=%d, depth=%d\n", + _PyOpcode_OpName[opcode], oparg, ts->fitness, eq, ts->frame_depth); + + if (ts->fitness < eq) { + // Heuristic exit: leave operand1=0 so the side exit increments chain_depth. + ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); + OPT_STAT_INC(fitness_terminated_traces); + DPRINTF(2, "Fitness terminated: %s(%d) fitness=%d < exit_quality=%d\n", + _PyOpcode_OpName[opcode], oparg, ts->fitness, eq); + goto done; + } + + // Snapshot the buffer before reserving tail slots. The later charge + // includes both emitted uops and capacity reserved for exits/deopts/errors. 
+ _PyUOpInstruction *next_before = trace->next; + _PyUOpInstruction *end_before = trace->end; + // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT trace->end -= 2; @@ -789,12 +842,7 @@ _PyJit_translate_single_bytecode_to_trace( trace->end -= needs_guard_ip; int space_needed = expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode)); - if (uop_buffer_remaining_space(trace) < space_needed) { - DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n", - space_needed, uop_buffer_remaining_space(trace)); - OPT_STAT_INC(trace_too_long); - goto done; - } + assert(uop_buffer_remaining_space(trace) > space_needed); ADD_TO_TRACE(_CHECK_VALIDITY, 0, 0, target); @@ -816,6 +864,12 @@ _PyJit_translate_single_bytecode_to_trace( assert(jump_happened ? (next_instr == computed_jump_instr) : (next_instr == computed_next_instr)); uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_happened]; ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(jump_happened ? 
computed_next_instr : computed_jump_instr, old_code)); + int bp = compute_branch_penalty(target_instr[1].cache, jump_happened); + tracer->translator_state.fitness -= bp; + DPRINTF(3, " branch penalty: -%d (history=0x%04x, taken=%d) -> fitness=%d\n", + bp, target_instr[1].cache, jump_happened, + tracer->translator_state.fitness); + break; } case JUMP_BACKWARD_JIT: @@ -823,29 +877,16 @@ _PyJit_translate_single_bytecode_to_trace( case JUMP_BACKWARD_NO_JIT: case JUMP_BACKWARD: ADD_TO_TRACE(_CHECK_PERIODIC, 0, 0, target); - _Py_FALLTHROUGH; + tracer->translator_state.fitness -= FITNESS_BACKWARD_EDGE; + DPRINTF(3, " backward edge penalty: -%d -> fitness=%d\n", + FITNESS_BACKWARD_EDGE, tracer->translator_state.fitness); + break; case JUMP_BACKWARD_NO_INTERRUPT: - { - if ((next_instr != tracer->initial_state.close_loop_instr) && - (next_instr != tracer->initial_state.start_instr) && - uop_buffer_length(&tracer->code_buffer) > CODE_SIZE_NO_PROGRESS && - // For side exits, we don't want to terminate them early. - tracer->initial_state.exit == NULL && - // These are coroutines, and we want to unroll those usually. - opcode != JUMP_BACKWARD_NO_INTERRUPT) { - // We encountered a JUMP_BACKWARD but not to the top of our own loop. - // We don't want to continue tracing as we might get stuck in the - // inner loop. Instead, end the trace where the executor of the - // inner loop might start and let the traces rejoin. 
- OPT_STAT_INC(inner_loop); - ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); - uop_buffer_last(trace)->operand1 = true; // is_control_flow - DPRINTF(2, "JUMP_BACKWARD not to top ends trace %p %p %p\n", next_instr, - tracer->initial_state.close_loop_instr, tracer->initial_state.start_instr); - goto done; - } + tracer->translator_state.fitness -= FITNESS_BACKWARD_EDGE_COROUTINE; + DPRINTF(3, " coroutine backward edge penalty: -%d -> fitness=%d\n", + FITNESS_BACKWARD_EDGE_COROUTINE, + tracer->translator_state.fitness); break; - } case RESUME: case RESUME_CHECK: @@ -945,6 +986,36 @@ _PyJit_translate_single_bytecode_to_trace( assert(next->op.code == STORE_FAST); operand = next->op.arg; } + else if (uop == _PUSH_FRAME) { + _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; + ts_depth->frame_depth++; + assert(ts_depth->frame_depth < MAX_ABSTRACT_FRAME_DEPTH); + int32_t frame_penalty = compute_frame_penalty(tstate->interp->opt_config.fitness_initial); + ts_depth->fitness -= frame_penalty; + DPRINTF(3, " _PUSH_FRAME: depth=%d, penalty=-%d -> fitness=%d\n", + ts_depth->frame_depth, frame_penalty, + ts_depth->fitness); + } + else if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) { + _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; + int32_t frame_penalty = compute_frame_penalty(tstate->interp->opt_config.fitness_initial); + if (ts_depth->frame_depth <= 0) { + // Returning from a frame we didn't enter -- penalize. + ts_depth->fitness -= frame_penalty; + DPRINTF(3, " %s: underflow penalty=-%d -> fitness=%d\n", + _PyOpcode_uop_name[uop], frame_penalty, + ts_depth->fitness); + } + else { + // Symmetric with push: net-zero frame impact. 
+ ts_depth->fitness += frame_penalty; + ts_depth->frame_depth--; + DPRINTF(3, " %s: return reward=+%d, depth=%d -> fitness=%d\n", + _PyOpcode_uop_name[uop], frame_penalty, + ts_depth->frame_depth, + ts_depth->fitness); + } + } else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) { PyObject *recorded_value = tracer->prev_state.recorded_value; tracer->prev_state.recorded_value = NULL; @@ -986,7 +1057,18 @@ _PyJit_translate_single_bytecode_to_trace( ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0); goto done; } - DPRINTF(2, "Trace continuing\n"); + // Charge fitness by trace-buffer capacity consumed for this bytecode, + // including both emitted uops and tail reservations. + { + int32_t slots_fwd = (int32_t)(trace->next - next_before); + int32_t slots_rev = (int32_t)(end_before - trace->end); + int32_t slots_used = slots_fwd + slots_rev; + tracer->translator_state.fitness -= slots_used; + DPRINTF(3, " per-insn cost: -%d (fwd=%d, rev=%d) -> fitness=%d\n", + slots_used, slots_fwd, slots_rev, + tracer->translator_state.fitness); + } + DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness); return 1; done: DPRINTF(2, "Trace done\n"); @@ -1069,6 +1151,16 @@ _PyJit_TryInitializeTracing( assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL)); tracer->initial_state.jump_backward_instr = curr_instr; + // Reduce side-trace fitness as chain depth grows, but clamp the reduction + // after depth 4 so deeply chained exits still have at least half budget. 
+ const _PyOptimizationConfig *cfg = &tstate->interp->opt_config; + _PyJitTracerTranslatorState *ts = &tracer->translator_state; + int effective_depth = Py_MIN(chain_depth, 4); + ts->fitness = (int32_t)((8 - effective_depth) * cfg->fitness_initial / 8); + ts->frame_depth = 0; + DPRINTF(3, "Fitness init: chain_depth=%d, fitness=%d\n", + chain_depth, ts->fitness); + tracer->is_tracing = true; return 1; } @@ -2101,7 +2193,11 @@ _PyDumpExecutors(FILE *out) fprintf(out, " node [colorscheme=greys9]\n"); PyInterpreterState *interp = PyInterpreterState_Get(); for (size_t i = 0; i < interp->executor_count; i++) { - executor_to_gv(interp->executor_ptrs[i], out); + _PyExecutorObject *exec = interp->executor_ptrs[i]; + if (exec->vm_data.code == NULL) { + continue; + } + executor_to_gv(exec, out); } fprintf(out, "}\n\n"); return 0; diff --git a/Python/pystate.c b/Python/pystate.c index 143175da0f45c7..4ef9fd4e472358 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -635,6 +635,11 @@ init_interpreter(PyInterpreterState *interp, "PYTHON_JIT_SIDE_EXIT_INITIAL_BACKOFF", SIDE_EXIT_INITIAL_BACKOFF, 0, MAX_BACKOFF); + // Trace fitness configuration + init_policy(&interp->opt_config.fitness_initial, + "PYTHON_JIT_FITNESS_INITIAL", + FITNESS_INITIAL, EXIT_QUALITY_CLOSE_LOOP, UOP_MAX_TRACE_LENGTH - 1); + interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF"); interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE"); if (interp != &runtime->_main_interpreter) { diff --git a/Python/pystats.c b/Python/pystats.c index a057ad884566d8..2fac2db1b738c7 100644 --- a/Python/pystats.c +++ b/Python/pystats.c @@ -274,6 +274,7 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence); fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee); fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated); 
+ fprintf(out, "Optimization fitness terminated: %" PRIu64 "\n", stats->fitness_terminated_traces); print_histogram(out, "Trace length", stats->trace_length_hist); print_histogram(out, "Trace run length", stats->trace_run_length_hist);