diff --git a/esphome/components/esp32/core.cpp b/esphome/components/esp32/core.cpp
index 59b791da403..7ebbba609e8 100644
--- a/esphome/components/esp32/core.cpp
+++ b/esphome/components/esp32/core.cpp
@@ -22,8 +22,8 @@ extern "C" __attribute__((weak)) void initArduino() {}
 namespace esphome {
 
 void HOT yield() { vPortYield(); }
-uint32_t IRAM_ATTR HOT millis() { return (uint32_t) (esp_timer_get_time() / 1000ULL); }
-uint64_t HOT millis_64() { return static_cast<uint64_t>(esp_timer_get_time()) / 1000ULL; }
+uint32_t IRAM_ATTR HOT millis() { return micros_to_millis<uint32_t>(static_cast<uint64_t>(esp_timer_get_time())); }
+uint64_t HOT millis_64() { return micros_to_millis<uint64_t>(static_cast<uint64_t>(esp_timer_get_time())); }
 void HOT delay(uint32_t ms) { vTaskDelay(ms / portTICK_PERIOD_MS); }
 uint32_t IRAM_ATTR HOT micros() { return (uint32_t) esp_timer_get_time(); }
 void IRAM_ATTR HOT delayMicroseconds(uint32_t us) { delay_microseconds_safe(us); }
diff --git a/esphome/components/rp2040/core.cpp b/esphome/components/rp2040/core.cpp
index 6386d53292f..a15ee7e2635 100644
--- a/esphome/components/rp2040/core.cpp
+++ b/esphome/components/rp2040/core.cpp
@@ -11,8 +11,8 @@
 namespace esphome {
 
 void HOT yield() { ::yield(); }
-uint64_t millis_64() { return time_us_64() / 1000ULL; }
-uint32_t HOT millis() { return static_cast<uint32_t>(millis_64()); }
+uint64_t millis_64() { return micros_to_millis<uint64_t>(::time_us_64()); }
+uint32_t HOT millis() { return micros_to_millis<uint32_t>(::time_us_64()); }
 void HOT delay(uint32_t ms) { ::delay(ms); }
 uint32_t HOT micros() { return ::micros(); }
 void HOT delayMicroseconds(uint32_t us) { delay_microseconds_safe(us); }
diff --git a/esphome/core/helpers.h b/esphome/core/helpers.h
index c68cb549bb4..ae505a2d8a0 100644
--- a/esphome/core/helpers.h
+++ b/esphome/core/helpers.h
@@ -599,6 +599,44 @@ template<std::integral T> constexpr uint32_t fnv1a_hash_extend(uint32_t hash, T
 constexpr uint32_t fnv1a_hash(const char *str) { return fnv1a_hash_extend(FNV1_OFFSET_BASIS, str); }
 inline uint32_t fnv1a_hash(const std::string &str) { return fnv1a_hash(str.c_str()); }
 
+/// Convert a 64-bit microsecond count to milliseconds without calling
+/// __udivdi3 (software 64-bit divide, ~1200 ns on Xtensa @ 240 MHz).
+///
+/// Returns uint32_t by default (for millis()), or uint64_t when requested
+/// (for millis_64()). The only difference is whether hi * Q is truncated
+/// to 32 bits or widened to 64.
+///
+/// On 32-bit targets, GCC does not optimize 64-bit constant division into a
+/// multiply-by-reciprocal. Since 1000 = 8 * 125, we first right-shift by 3
+/// (free divide-by-8), then use the Euclidean division identity to decompose
+/// the remaining 64-bit divide-by-125 into a single 32-bit division:
+///
+///   floor(us / 1000) = floor(floor(us / 8) / 125)    [exact for integers]
+///   2^32 = Q * 125 + R  (34359738 * 125 + 46)
+///   (hi * 2^32 + lo) / 125 = hi * Q + (hi * R + lo) / 125
+///
+/// GCC optimizes the remaining 32-bit "/ 125U" into a multiply-by-reciprocal
+/// (mulhu + shift), so no division instruction is emitted.
+///
+/// Safe for us up to ~3.2e18 (~101,700 years of microseconds).
+///
+/// See: https://en.wikipedia.org/wiki/Euclidean_division
+/// See: https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
+template<typename ReturnT = uint32_t> inline constexpr ESPHOME_ALWAYS_INLINE ReturnT micros_to_millis(uint64_t us) {
+  constexpr uint32_t divisor = 125U;
+  constexpr uint32_t quot = static_cast<uint32_t>((1ULL << 32) / divisor);  // 34359738
+  constexpr uint32_t rem = static_cast<uint32_t>((1ULL << 32) % divisor);   // 46
+  // Strip the factor of 8 with a shift (1000 = 8 * 125), then split the result into 32-bit halves
+  const uint64_t eighths = us >> 3;
+  const uint32_t high = static_cast<uint32_t>(eighths >> 32);
+  const uint32_t low = static_cast<uint32_t>(eighths);
+  // 32-bit remainder term; if it wrapped (sum < low) the true value is 2^32 + sum, so apply the identity again
+  const uint32_t sum = high * rem + low;
+  const uint32_t tail = sum < low ? (sum + rem) / divisor + quot : sum / divisor;
+  // Widen high to ReturnT before multiplying so a uint64_t result keeps the upper bits of high * quot
+  return static_cast<ReturnT>(high) * quot + tail;
+}
+
 /// Return a random 32-bit unsigned integer.
 uint32_t random_uint32();
 /// Return a random float between 0 and 1.
diff --git a/tests/integration/fixtures/micros_to_millis.yaml b/tests/integration/fixtures/micros_to_millis.yaml
new file mode 100644
index 00000000000..d11808c43a2
--- /dev/null
+++ b/tests/integration/fixtures/micros_to_millis.yaml
@@ -0,0 +1,61 @@
+esphome:
+  name: micros-to-millis-test
+  platformio_options:
+    build_flags:
+      - "-DDEBUG"
+  on_boot:
+    - lambda: |-
+        using esphome::micros_to_millis;
+        const char *TAG = "MTM";
+        int pass = 0, fail = 0;
+        // Each case checks both return widths (default uint32_t and uint64_t) against plain 64-bit division
+        auto check = [&](const char *name, uint64_t us) {
+          const unsigned long long want = us / 1000ULL, got64 = micros_to_millis<uint64_t>(us);
+          const uint32_t got32 = micros_to_millis(us);
+          if (got32 == static_cast<uint32_t>(want) && got64 == want) { pass++; return; }
+          ESP_LOGE(TAG, "%s FAILED: got32=%u got64=%llu want=%llu", name, got32, got64, want); fail++;
+        };
+
+        // Basic values around the first millisecond boundary
+        check("zero", 0);
+        check("below_1ms", 999);
+        check("exactly_1ms", 1000);
+        check("above_1ms", 1001);
+
+        // Values around 8000 us (1000 = 8 * 125, exercises the >>3 shift)
+        check("shift_7999", 7999);
+        check("shift_8000", 8000);
+        check("shift_8001", 8001);
+
+        // 32-bit boundary of the microsecond input (the upper half of us>>3 is still 0 here)
+        check("u32max_minus1", 0xFFFFFFFEULL);
+        check("u32max", 0xFFFFFFFFULL);
+        check("u32max_plus1", 0x100000000ULL);
+
+        // Realistic uptimes (30 days and 365 days expressed in microseconds)
+        check("30_days", 2592000000000ULL);
+        check("1_year", 31536000000000ULL);
+
+        // Carry path: build x = us>>3 from hi/lo halves chosen so that hi * 46 + lo wraps past 2^32
+        { uint64_t x = (603ULL << 32) | 0xFFFFFFFFU; check("carry_603", x << 3); }
+        { uint64_t x = (5000ULL << 32) | 0xFFFFFFFFU; check("carry_5000", x << 3); }
+
+        // Carry boundary (hi=1000, R=46): lo=thr gives adj=0xFFFFFFFF (no wrap), lo=thr+1 wraps adj to 0
+        {
+          uint32_t hi = 1000;
+          uint32_t thr = 0xFFFFFFFFU - hi * 46U;
+          uint64_t h = (uint64_t)hi << 32;
+          check("carry_before", (h | (thr - 1)) << 3);
+          check("carry_at", (h | thr) << 3);
+          check("carry_after", (h | (thr + 1)) << 3);
+        }
+
+        // All eight residues mod 8 on top of the 30-day value (exercises the >>3 truncation)
+        for (int i = 0; i < 8; i++) { check("mod8", 2592000000000ULL + i); }
+
+        if (fail == 0) { ESP_LOGI(TAG, "ALL_PASSED %d tests", pass); }
+        else { ESP_LOGE(TAG, "%d FAILED out of %d", fail, pass + fail); }
+
+host:
+api:
+logger:
diff --git a/tests/integration/test_micros_to_millis.py b/tests/integration/test_micros_to_millis.py
new file mode 100644
index 00000000000..9960d6b017f
--- /dev/null
+++ b/tests/integration/test_micros_to_millis.py
@@ -0,0 +1,46 @@
+"""Integration test for micros_to_millis Euclidean decomposition."""
+
+from __future__ import annotations
+
+import asyncio
+import re
+
+import pytest
+
+from .types import APIClientConnectedFactory, RunCompiledFunction
+
+
+@pytest.mark.asyncio
+async def test_micros_to_millis(
+    yaml_config: str,
+    run_compiled: RunCompiledFunction,
+    api_client_connected: APIClientConnectedFactory,
+) -> None:
+    """Test that micros_to_millis matches reference uint64 division."""
+
+    all_passed = asyncio.Event()
+    failures: list[str] = []
+
+    def on_log_line(line: str) -> None:
+        clean_line = re.sub(r"\x1b\[[0-9;]*m", "", line)
+        if "ALL_PASSED" in clean_line:
+            all_passed.set()
+        elif "FAILED" in clean_line and "[MTM" in clean_line:
+            failures.append(clean_line)
+
+    async with (
+        run_compiled(yaml_config, line_callback=on_log_line),
+        api_client_connected() as client,
+    ):
+        device_info = await client.device_info()
+        assert device_info is not None
+        assert device_info.name == "micros-to-millis-test"
+
+        try:
+            await asyncio.wait_for(all_passed.wait(), timeout=2.0)
+        except TimeoutError:
+            if failures:
+                pytest.fail(f"micros_to_millis failures: {failures}")
+            pytest.fail("micros_to_millis test timed out")
+
+        assert not failures, f"micros_to_millis failures: {failures}"