
Commit 2adf9df

Merge branch 'master' into fix_meta_tool
2 parents 9ec572e + 3cdc482 commit 2adf9df

2,826 files changed: +113,317 −112,390 lines
Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@

---
name: doris-docker-regression
description: Run Doris docker-based regression tests from a clean package
compatibility: opencode
---
## Purpose

Run Doris docker-based regression tests from a clean package, avoiding contamination from local `conf/` or startup-script modifications.
## Prerequisites

### Python Environment

Requires Python 3. Install dependencies from `docker/runtime/doris-compose/requirements.txt`:

```bash
python -m pip install --user -r docker/runtime/doris-compose/requirements.txt
```

If installation fails (especially on PyYAML conflicts), pin specific versions first:

```bash
python -m pip install --user pyyaml==5.3.1 docker==6.1.3
python -m pip install --user -r docker/runtime/doris-compose/requirements.txt
```

Alternatively, use a virtual environment:

```bash
python -m venv doris-compose-env
source doris-compose-env/bin/activate
pip install -r docker/runtime/doris-compose/requirements.txt
```
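Before moving on, a quick import check can confirm the key modules are present. The helper below is illustrative (the function name and module list are assumptions, not part of the repo):

```bash
# Illustrative helper: report which Python modules are importable, so a
# missing doris-compose dependency is caught before running the tool.
check_py_mods() {
    local mod rc=0
    for mod in "$@"; do
        if python3 -c "import ${mod}" >/dev/null 2>&1; then
            echo "ok: ${mod}"
        else
            echo "missing: ${mod}"
            rc=1
        fi
    done
    return "${rc}"
}

# doris-compose needs at least PyYAML ("yaml") and the docker SDK ("docker"):
#   check_py_mods yaml docker
```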
### Docker Environment

```bash
docker run hello-world      # Docker works
docker compose version      # Compose v2
docker-compose version      # should resolve to Compose v2
```

If `docker-compose` fails with `TypeError: kwargs_from_env()`, it is likely Compose v1; install Docker Compose v2 and make sure `docker-compose` resolves to it.

Check port availability:

```bash
lsof -nP -iTCP:8030,8040,8050,8060,8070,9010,9020,9030,9050,9060 -sTCP:LISTEN
```

If a port is already in use, ask the user how to handle the conflict.
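When `lsof` is unavailable, the same check can be approximated with bash's `/dev/tcp` pseudo-device: a successful connect means something is already listening. This is a sketch, not part of the repo tooling:

```bash
# Illustrative port probe: print the subset of the given ports that have a
# listener on localhost. A failed connect (connection refused) means free.
ports_in_use() {
    local port busy=()
    for port in "$@"; do
        # The subshell closes fd 3 automatically on exit.
        if (exec 3<>"/dev/tcp/127.0.0.1/${port}") 2>/dev/null; then
            busy+=("${port}")
        fi
    done
    echo "${busy[*]-}"
}

# Usage sketch:
#   busy="$(ports_in_use 8030 8040 9030)"
#   [ -z "$busy" ] || echo "conflict on: $busy"
```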
## Build

```bash
./build.sh --fe --be -j<parallel_jobs> --output <output_directory>
```
## Prepare Clean Package

Sanitize configs from git HEAD:

```bash
git show HEAD:conf/fe.conf > <output_directory>/fe/conf/fe.conf
git show HEAD:conf/be.conf > <output_directory>/be/conf/be.conf
git show HEAD:bin/start_fe.sh > <output_directory>/fe/bin/start_fe.sh
chmod 755 <output_directory>/fe/bin/start_fe.sh
```

Verify the clean configs use **default ports**:

- `fe.conf`: 8030, 9020, 9030, 9010, 8070
- `be.conf`: 9060, 8040, 9050, 8060, 8050
- `start_fe.sh`: no `-agentlib:jdwp` debug agent
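A small script can confirm the sanitization took effect. The function below is a sketch (the checked strings are the default ports and the JDWP marker listed above; the function name is an assumption):

```bash
# Illustrative check: confirm the sanitized package mentions the default FE
# ports and carries no JDWP debug agent in start_fe.sh.
verify_clean_pkg() {
    local out_dir="$1" port bad=0
    for port in 8030 9030; do
        if ! grep -q "${port}" "${out_dir}/fe/conf/fe.conf"; then
            echo "fe.conf: expected default port ${port} not found"
            bad=1
        fi
    done
    if grep -q "agentlib:jdwp" "${out_dir}/fe/bin/start_fe.sh"; then
        echo "start_fe.sh: debug agent still present"
        bad=1
    fi
    if [ "${bad}" -eq 0 ]; then
        echo "package looks clean"
    fi
    return "${bad}"
}
```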
## Build Docker Image

```bash
docker build \
    --build-arg OUTPUT_PATH=<output_directory> \
    -f docker/runtime/doris-compose/Dockerfile \
    -t <image_name>:latest \
    .
```
## Configure Regression

Create `regression-test/conf/regression-conf-custom.groovy`:

```groovy
image = "<image_name>:latest"
excludeDockerTest = false
testGroups = "docker"
```
## Run Regression

```bash
./run-regression-test.sh --run -d <directory> -s <suite_name>
```
## Troubleshooting

| Symptom | Cause | Fix |
|---------|-------|-----|
| 0 suites run | `excludeDockerTest=true` or missing `testGroups="docker"` | Check `regression-conf-custom.groovy` |
| JDWP error | `start_fe.sh` has debug agent | Re-sanitize from `git show HEAD:bin/start_fe.sh` |
| Wrong ports | Configs have local edits | Re-sanitize from `git show HEAD:conf/fe.conf` / `be.conf` |
| Port conflict | Processes using default ports | `lsof` to find them, then kill |
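The first row of the table can be triaged mechanically. This sketch (function name assumed, not part of the repo) scans `regression-conf-custom.groovy` for the two settings that most often silence docker suites:

```bash
# Illustrative triage for "0 suites run": check the groovy conf for the
# docker-suite toggles. Returns nonzero if a problem is found.
diagnose_conf() {
    local conf="$1"
    if grep -q 'excludeDockerTest *= *true' "${conf}"; then
        echo "excludeDockerTest is true"
        return 1
    fi
    if ! grep -q 'testGroups *= *"docker"' "${conf}"; then
        echo 'testGroups is not "docker"'
        return 1
    fi
    echo "conf ok"
}

# Usage sketch:
#   diagnose_conf regression-test/conf/regression-conf-custom.groovy
```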
## Debug Logs

Runtime cluster files live under `/tmp/doris/<suite_name>/`:

- `fe-1/log/fe.log`, `fe.warn.log`, `fe.out`, `health.out`
- `be-1/log/be.INFO`, `be.WARNING`, `be.out`, `health.out`
- `doris-compose.log`
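To pull these logs together quickly, a helper along these lines can tail each file that exists (function name and defaults are assumptions):

```bash
# Illustrative helper: print the tail of each known log under the suite's
# runtime directory, skipping files that do not exist.
show_doris_logs() {
    local root="$1" n="${2:-20}" f
    for f in fe-1/log/fe.log fe-1/log/fe.warn.log fe-1/log/fe.out \
             be-1/log/be.INFO be-1/log/be.WARNING be-1/log/be.out \
             doris-compose.log; do
        if [ -f "${root}/${f}" ]; then
            echo "==== ${f} ===="
            tail -n "${n}" "${root}/${f}"
        fi
    done
}

# Usage sketch:
#   show_doris_logs /tmp/doris/<suite_name> 50
```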
## Full Command Sequence

```bash
# 1. Check environment
python --version
docker run hello-world
docker compose version
lsof -nP -iTCP:8030,8040,8050,8060,8070,9010,9020,9030,9050,9060 -sTCP:LISTEN

# 2. Build
./build.sh --fe --be -j60 --output <output_directory>

# 3. Sanitize package configs
git show HEAD:conf/fe.conf > <output_directory>/fe/conf/fe.conf
git show HEAD:conf/be.conf > <output_directory>/be/conf/be.conf
git show HEAD:bin/start_fe.sh > <output_directory>/fe/bin/start_fe.sh
chmod 755 <output_directory>/fe/bin/start_fe.sh

# 4. Build image
docker build --build-arg OUTPUT_PATH=<output_directory> -f docker/runtime/doris-compose/Dockerfile -t <image_name>:latest .

# 5. Configure (create regression-conf-custom.groovy)

# 6. Run
./run-regression-test.sh --run -d <directory> -s <suite_name>
```

.github/workflows/build-extension.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -114,7 +114,7 @@ jobs:
        run: |
          pushd thirdparty
          branch="${{ github.base_ref }}"
-         if [[ -z "${branch}" ]] || [[ "${branch}" == 'master' || "${branch}" == 'branch-4.0' || "${branch}" == 'branch-3.0' || "${branch}" == 'branch-2.1' ]]; then
+         if [[ -z "${branch}" ]] || [[ "${branch}" == 'master' || "${branch}" == 'branch-4.1'|| "${branch}" == 'branch-4.0' || "${branch}" == 'branch-3.0' || "${branch}" == 'branch-2.1' ]]; then
            curl -L https://github.com/apache/doris-thirdparty/releases/download/automation/doris-thirdparty-prebuilt-linux-x86_64.tar.xz \
              -o doris-thirdparty-prebuilt-linux-x86_64.tar.xz
          else
```

.github/workflows/opencode-review.yml

Lines changed: 13 additions & 5 deletions

```diff
@@ -92,6 +92,9 @@ jobs:
 - After completing the review, you MUST provide a final summary opinion based on the rules defined in AGENTS.md and the code-review skill. The summary must include conclusions for each applicable critical checkpoint.
 - If no issues to report, submit a short summary comment saying no issues found using: gh pr comment PLACEHOLDER_PR_NUMBER --body "<summary>"
 - If issues found, submit a review with inline comments plus a comprehensive summary body. Use GitHub Reviews API to ensure comments are inline:
+  - Inline comment bodies may include GitHub suggested changes blocks when you can propose a precise patch.
+  - Prefer suggested changes for small, self-contained fixes (for example typos, trivial refactors, or narrowly scoped code corrections).
+  - Do not force suggested changes for broad, architectural, or multi-file issues; explain those normally.
 - Build a JSON array of comments like: [{ "path": "<file>", "position": <diff_position>, "body": "..." }]
 - Submit via: gh api repos/PLACEHOLDER_REPO/pulls/PLACEHOLDER_PR_NUMBER/reviews --input <json_file>
 - The JSON file should contain: {"event":"COMMENT","body":"<summary>","comments":[...]}
@@ -109,14 +112,15 @@ jobs:
 
      - name: Run automated code review
        id: review
+       timeout-minutes: 55
        continue-on-error: true
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          PROMPT=$(cat /tmp/review_prompt.txt)
 
          set +e
-         opencode run "$PROMPT" -m "github-copilot/claude-opus-4.6" 2>&1 | tee /tmp/opencode-review.log
+         opencode run "$PROMPT" -m "github-copilot/gpt-5.4" 2>&1 | tee /tmp/opencode-review.log
          status=${PIPESTATUS[0]}
          set -e
 
@@ -139,26 +143,30 @@ jobs:
          fi
 
      - name: Comment PR on review failure
-       if: ${{ steps.review.outcome == 'failure' }}
+       if: ${{ always() && steps.review.outcome != 'success' }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          FAILURE_REASON: ${{ steps.review.outputs.failure_reason }}
+         REVIEW_OUTCOME: ${{ steps.review.outcome }}
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: |
+         error_msg="${FAILURE_REASON:-Review step was $REVIEW_OUTCOME (possibly timeout or cancelled)}"
          gh pr comment "${{ github.event.issue.number }}" --body "$(cat <<EOF
          OpenCode automated review failed and did not complete.
 
-         Error: ${FAILURE_REASON}
+         Error: ${error_msg}
          Workflow run: ${RUN_URL}
 
          Please inspect the workflow logs and rerun the review after the underlying issue is resolved.
          EOF
          )"
 
      - name: Fail workflow if review failed
-       if: ${{ steps.review.outcome == 'failure' }}
+       if: ${{ always() && steps.review.outcome != 'success' }}
        env:
          FAILURE_REASON: ${{ steps.review.outputs.failure_reason }}
+         REVIEW_OUTCOME: ${{ steps.review.outcome }}
        run: |
-         echo "OpenCode automated review failed: ${FAILURE_REASON}"
+         error_msg="${FAILURE_REASON:-Review step was $REVIEW_OUTCOME (possibly timeout or cancelled)}"
+         echo "OpenCode automated review failed: ${error_msg}"
          exit 1
```

be/CMakeLists.txt

Lines changed: 9 additions & 1 deletion

```diff
@@ -428,7 +428,10 @@ endif()
 # -DNDEBUG: Turn off dchecks/asserts/debug only code.
 set(CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
 message(STATUS "UBSAN_IGNORELIST is ${UBSAN_IGNORELIST}")
-set(CXX_FLAGS_ASAN "-O0 -fsanitize=address -fsanitize=undefined -fno-sanitize=float-cast-overflow -fsanitize-ignorelist=${UBSAN_IGNORELIST} -DUNDEFINED_BEHAVIOR_SANITIZER -DADDRESS_SANITIZER")
+set(CXX_FLAGS_ASAN "-O0 -fsanitize=address -fsanitize=undefined -fno-sanitize=float-cast-overflow -DUNDEFINED_BEHAVIOR_SANITIZER -DADDRESS_SANITIZER")
+if (COMPILER_CLANG)
+    set(CXX_FLAGS_ASAN "${CXX_FLAGS_ASAN} -fsanitize-ignorelist=${UBSAN_IGNORELIST}")
+endif()
 set(CXX_FLAGS_LSAN "-O0 -fsanitize=leak -DLEAK_SANITIZER")
 ## Use for BE-UT
 set(CXX_FLAGS_ASAN_UT "-O0 -fsanitize=address -DADDRESS_SANITIZER")
@@ -706,6 +709,11 @@ else()
         -lresolv
         -liconv
     )
+    # On macOS arm64, large binaries (>2GB) can cause the virtual address space
+    # to overlap with the dyld shared cache region, resulting in "dyld cache not
+    # loaded: syscall to map cache into shared region failed". Use -dead_strip to
+    # remove unreachable code/data and reduce binary size.
+    add_link_options(-Wl,-dead_strip)
 endif()
 
```

be/src/agent/agent_server.cpp

Lines changed: 2 additions & 1 deletion

```diff
@@ -211,7 +211,8 @@ void AgentServer::cloud_start_workers(CloudStorageEngine& engine, ExecEnv* exec_
 
     _workers[TTaskType::ALTER] = std::make_unique<TaskWorkerPool>(
             "ALTER_TABLE", config::alter_tablet_worker_count,
-            [&engine](auto&& task) { return alter_cloud_tablet_callback(engine, task); });
+            [&engine](auto&& task) { return alter_cloud_tablet_callback(engine, task); },
+            [&engine](auto&& task) { set_alter_version_before_enqueue(engine, task); });
 
     _workers[TTaskType::CALCULATE_DELETE_BITMAP] = std::make_unique<TaskWorkerPool>(
             "CALC_DBM_TASK", config::calc_delete_bitmap_worker_count,
```

be/src/agent/task_worker_pool.cpp

Lines changed: 47 additions & 3 deletions

```diff
@@ -525,9 +525,11 @@ bvar::Adder<uint64_t> report_index_policy_failed("report", "index_policy_failed"
 
 } // namespace
 
-TaskWorkerPool::TaskWorkerPool(std::string_view name, int worker_count,
-                               std::function<void(const TAgentTaskRequest& task)> callback)
-        : _callback(std::move(callback)) {
+TaskWorkerPool::TaskWorkerPool(
+        std::string_view name, int worker_count,
+        std::function<void(const TAgentTaskRequest& task)> callback,
+        std::function<void(const TAgentTaskRequest& task)> pre_submit_callback)
+        : _callback(std::move(callback)), _pre_submit_callback(std::move(pre_submit_callback)) {
     auto st = ThreadPoolBuilder(fmt::format("TaskWP_{}", name))
                       .set_min_threads(worker_count)
                       .set_max_threads(worker_count)
@@ -551,6 +553,9 @@ void TaskWorkerPool::stop() {
 
 Status TaskWorkerPool::submit_task(const TAgentTaskRequest& task) {
     return _submit_task(task, [this](auto&& task) {
+        if (_pre_submit_callback) {
+            _pre_submit_callback(task);
+        }
         add_task_count(task, 1);
         return _thread_pool->submit_func([this, task]() {
             _callback(task);
@@ -2244,9 +2249,48 @@ void alter_cloud_tablet_callback(CloudStorageEngine& engine, const TAgentTaskReq
                       std::chrono::system_clock::now().time_since_epoch())
                       .count();
     g_fragment_last_active_time.set_value(now);
+
+    // Clean up alter_version before remove_task_info to avoid race:
+    // remove_task_info allows same-signature re-submit, whose pre_submit_callback
+    // would set alter_version, then this cleanup would wipe it.
+    if (req.__isset.alter_tablet_req_v2) {
+        const auto& alter_req = req.alter_tablet_req_v2;
+        auto new_tablet = engine.tablet_mgr().get_tablet(alter_req.new_tablet_id);
+        auto base_tablet = engine.tablet_mgr().get_tablet(alter_req.base_tablet_id);
+        if (new_tablet.has_value()) {
+            new_tablet.value()->set_alter_version(-1);
+        }
+        if (base_tablet.has_value()) {
+            base_tablet.value()->set_alter_version(-1);
+        }
+    }
+
     remove_task_info(req.task_type, req.signature);
 }
 
+void set_alter_version_before_enqueue(CloudStorageEngine& engine, const TAgentTaskRequest& req) {
+    if (!req.__isset.alter_tablet_req_v2) {
+        return;
+    }
+    const auto& alter_req = req.alter_tablet_req_v2;
+    if (alter_req.alter_version <= 1) {
+        return;
+    }
+    auto new_tablet = engine.tablet_mgr().get_tablet(alter_req.new_tablet_id);
+    if (!new_tablet.has_value() || new_tablet.value()->tablet_state() == TABLET_RUNNING) {
+        return;
+    }
+    auto base_tablet = engine.tablet_mgr().get_tablet(alter_req.base_tablet_id);
+    if (!base_tablet.has_value()) {
+        return;
+    }
+    new_tablet.value()->set_alter_version(alter_req.alter_version);
+    base_tablet.value()->set_alter_version(alter_req.alter_version);
+    LOG(INFO) << "set alter_version=" << alter_req.alter_version
+              << " before enqueue, base_tablet=" << alter_req.base_tablet_id
+              << ", new_tablet=" << alter_req.new_tablet_id;
+}
+
 void gc_binlog_callback(StorageEngine& engine, const TAgentTaskRequest& req) {
     std::unordered_map<int64_t, int64_t> gc_tablet_infos;
     if (!req.__isset.gc_binlog_req) {
```

be/src/agent/task_worker_pool.h

Lines changed: 5 additions & 1 deletion

```diff
@@ -50,7 +50,8 @@ class TaskWorkerPoolIf {
 class TaskWorkerPool : public TaskWorkerPoolIf {
 public:
     TaskWorkerPool(std::string_view name, int worker_count,
-                   std::function<void(const TAgentTaskRequest&)> callback);
+                   std::function<void(const TAgentTaskRequest&)> callback,
+                   std::function<void(const TAgentTaskRequest&)> pre_submit_callback = nullptr);
 
     ~TaskWorkerPool() override;
 
@@ -62,6 +63,7 @@ class TaskWorkerPool : public TaskWorkerPoolIf {
     std::atomic_bool _stopped {false};
     std::unique_ptr<ThreadPool> _thread_pool;
     std::function<void(const TAgentTaskRequest&)> _callback;
+    std::function<void(const TAgentTaskRequest&)> _pre_submit_callback;
 };
 
 class PublishVersionWorkerPool final : public TaskWorkerPool {
@@ -180,6 +182,8 @@ void alter_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req);
 
 void alter_cloud_tablet_callback(CloudStorageEngine& engine, const TAgentTaskRequest& req);
 
+void set_alter_version_before_enqueue(CloudStorageEngine& engine, const TAgentTaskRequest& req);
+
 void clone_callback(StorageEngine& engine, const ClusterInfo* cluster_info,
                     const TAgentTaskRequest& req);
```

be/src/cloud/cloud_cumulative_compaction.cpp

Lines changed: 16 additions & 0 deletions

```diff
@@ -292,6 +292,22 @@ Status CloudCumulativeCompaction::modify_rowsets() {
         LOG(INFO) << "CloudCumulativeCompaction::modify_rowsets.enable_spin_wait, exit";
     });
 
+    // Block only NOTREADY tablets (SC new tablets) before compaction commit.
+    // RUNNING tablets (system tables, base tablets) are not affected.
+    DBUG_EXECUTE_IF("CloudCumulativeCompaction::modify_rowsets.block_notready", {
+        if (_tablet->tablet_state() == TABLET_NOTREADY) {
+            LOG(INFO) << "block NOTREADY tablet compaction before commit"
+                      << ", tablet_id=" << _tablet->tablet_id() << ", output=["
+                      << _input_rowsets.front()->start_version() << "-"
+                      << _input_rowsets.back()->end_version() << "]";
+            while (DebugPoints::instance()->is_enable(
+                    "CloudCumulativeCompaction::modify_rowsets.block_notready")) {
+                std::this_thread::sleep_for(std::chrono::milliseconds(50));
+            }
+            LOG(INFO) << "release NOTREADY tablet compaction, tablet_id=" << _tablet->tablet_id();
+        }
+    });
+
     DeleteBitmapPtr output_rowset_delete_bitmap = nullptr;
     int64_t initiator = this->initiator();
     int64_t get_delete_bitmap_lock_start_time = 0;
```
