mirror of
https://github.com/pchuan98/codex.git
synced 2026-07-01 00:31:56 +08:00
Improve GitHub issue deduplication reliability by introducing a staged two-pass Codex search strategy with deterministic fallback behavior, and remove an obsolete prompt file that was no longer used. (#11769)
### Changes
- Updated `.github/workflows/issue-deduplicator.yml`:
- Added richer issue input fields (`state`, `updatedAt`, `labels`) for
model context.
- Added two candidate pools:
- `codex-existing-issues-all.json` (`--state all`)
- `codex-existing-issues-open.json` (`--state open`)
- Added body truncation during JSON preparation to reduce prompt noise.
- Added **Pass 1** Codex run over all issues.
- Added normalization/validation step for Pass 1 output:
- tolerant JSON parsing
- self-issue filtering
- deduplication
- cap to 5 results
- Added **Pass 2 fallback** Codex run over open issues only, triggered
only when Pass 1 has no usable matches.
- Added normalization/validation step for Pass 2 output (same
filtering/dedup/cap behavior).
- Added final deterministic selector:
- prefer pass 2 if it finds matches
- otherwise use pass 1
- otherwise return no matches
- Added observability logs:
- pool sizes
- per-pass parse/match status
- final pass selected and final duplicate count
- Kept public issue-comment format unchanged.
- Added a comment documenting that the prompt text now lives inline in
the workflow.
- Deleted obsolete file:
- `.github/prompts/issue-deduplicator.txt`
### Behavior Impact
- Better duplicate recall when broad search fails by retrying against
active issues only.
- More deterministic/noise-resistant output handling.
- No change to workflow trigger conditions, permissions, or issue
comment structure.
This commit is contained in:
committed by
GitHub
Unverified
parent
e71760fc64
commit
ffef5ce5de
@@ -1,18 +0,0 @@
|
|||||||
You are an assistant that triages new GitHub issues by identifying potential duplicates.
|
|
||||||
|
|
||||||
You will receive the following JSON files located in the current working directory:
|
|
||||||
- `codex-current-issue.json`: JSON object describing the newly created issue (fields: number, title, body).
|
|
||||||
- `codex-existing-issues.json`: JSON array of recent issues (each element includes number, title, body, createdAt).
|
|
||||||
|
|
||||||
Instructions:
|
|
||||||
- Load both files as JSON and review their contents carefully. The codex-existing-issues.json file is large, ensure you explore all of it.
|
|
||||||
- Compare the current issue against the existing issues to find up to five that appear to describe the same underlying problem or request.
|
|
||||||
- Only consider an issue a potential duplicate if there is a clear overlap in symptoms, feature requests, reproduction steps, or error messages.
|
|
||||||
- Prioritize newer issues when similarity is comparable.
|
|
||||||
- Ignore pull requests and issues whose similarity is tenuous.
|
|
||||||
- When unsure, prefer returning fewer matches.
|
|
||||||
|
|
||||||
Output requirements:
|
|
||||||
- Respond with a JSON array of issue numbers (integers), ordered from most likely duplicate to least.
|
|
||||||
- Include at most five numbers.
|
|
||||||
- If you find no plausible duplicates, respond with `[]`.
|
|
||||||
@@ -15,34 +15,68 @@ jobs:
|
|||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
outputs:
|
outputs:
|
||||||
codex_output: ${{ steps.codex.outputs.final-message }}
|
codex_output: ${{ steps.select-final.outputs.codex_output }}
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v6
|
||||||
|
|
||||||
- name: Prepare Codex inputs
|
- name: Prepare Codex inputs
|
||||||
env:
|
env:
|
||||||
GH_TOKEN: ${{ github.token }}
|
GH_TOKEN: ${{ github.token }}
|
||||||
|
REPO: ${{ github.repository }}
|
||||||
|
ISSUE_NUMBER: ${{ github.event.issue.number }}
|
||||||
run: |
|
run: |
|
||||||
set -eo pipefail
|
set -eo pipefail
|
||||||
|
|
||||||
CURRENT_ISSUE_FILE=codex-current-issue.json
|
CURRENT_ISSUE_FILE=codex-current-issue.json
|
||||||
EXISTING_ISSUES_FILE=codex-existing-issues.json
|
EXISTING_ALL_FILE=codex-existing-issues-all.json
|
||||||
|
EXISTING_OPEN_FILE=codex-existing-issues-open.json
|
||||||
|
|
||||||
gh issue list --repo "${{ github.repository }}" \
|
gh issue list --repo "$REPO" \
|
||||||
--json number,title,body,createdAt \
|
--json number,title,body,createdAt,updatedAt,state,labels \
|
||||||
--limit 1000 \
|
--limit 1000 \
|
||||||
--state all \
|
--state all \
|
||||||
--search "sort:created-desc" \
|
--search "sort:created-desc" \
|
||||||
| jq '.' \
|
| jq '[.[] | {
|
||||||
> "$EXISTING_ISSUES_FILE"
|
number,
|
||||||
|
title,
|
||||||
|
body: ((.body // "")[0:4000]),
|
||||||
|
createdAt,
|
||||||
|
updatedAt,
|
||||||
|
state,
|
||||||
|
labels: ((.labels // []) | map(.name))
|
||||||
|
}]' \
|
||||||
|
> "$EXISTING_ALL_FILE"
|
||||||
|
|
||||||
gh issue view "${{ github.event.issue.number }}" \
|
gh issue list --repo "$REPO" \
|
||||||
--repo "${{ github.repository }}" \
|
--json number,title,body,createdAt,updatedAt,state,labels \
|
||||||
|
--limit 1000 \
|
||||||
|
--state open \
|
||||||
|
--search "sort:created-desc" \
|
||||||
|
| jq '[.[] | {
|
||||||
|
number,
|
||||||
|
title,
|
||||||
|
body: ((.body // "")[0:4000]),
|
||||||
|
createdAt,
|
||||||
|
updatedAt,
|
||||||
|
state,
|
||||||
|
labels: ((.labels // []) | map(.name))
|
||||||
|
}]' \
|
||||||
|
> "$EXISTING_OPEN_FILE"
|
||||||
|
|
||||||
|
gh issue view "$ISSUE_NUMBER" \
|
||||||
|
--repo "$REPO" \
|
||||||
--json number,title,body \
|
--json number,title,body \
|
||||||
| jq '.' \
|
| jq '{number, title, body: ((.body // "")[0:4000])}' \
|
||||||
> "$CURRENT_ISSUE_FILE"
|
> "$CURRENT_ISSUE_FILE"
|
||||||
|
|
||||||
- id: codex
|
echo "Prepared duplicate detection input files."
|
||||||
|
echo "all_issue_count=$(jq 'length' "$EXISTING_ALL_FILE")"
|
||||||
|
echo "open_issue_count=$(jq 'length' "$EXISTING_OPEN_FILE")"
|
||||||
|
|
||||||
|
# Prompt instructions are intentionally inline in this workflow. The old
|
||||||
|
# .github/prompts/issue-deduplicator.txt file is obsolete and removed.
|
||||||
|
- id: codex-all
|
||||||
|
name: Find duplicates (pass 1, all issues)
|
||||||
uses: openai/codex-action@main
|
uses: openai/codex-action@main
|
||||||
with:
|
with:
|
||||||
openai-api-key: ${{ secrets.CODEX_OPENAI_API_KEY }}
|
openai-api-key: ${{ secrets.CODEX_OPENAI_API_KEY }}
|
||||||
@@ -52,14 +86,17 @@ jobs:
|
|||||||
|
|
||||||
You will receive the following JSON files located in the current working directory:
|
You will receive the following JSON files located in the current working directory:
|
||||||
- `codex-current-issue.json`: JSON object describing the newly created issue (fields: number, title, body).
|
- `codex-current-issue.json`: JSON object describing the newly created issue (fields: number, title, body).
|
||||||
- `codex-existing-issues.json`: JSON array of recent issues (each element includes number, title, body, createdAt).
|
- `codex-existing-issues-all.json`: JSON array of recent issues with states, timestamps, and labels.
|
||||||
|
|
||||||
Instructions:
|
Instructions:
|
||||||
- Compare the current issue against the existing issues to find up to five that appear to describe the same underlying problem or request.
|
- Compare the current issue against the existing issues to find up to five that appear to describe the same underlying problem or request.
|
||||||
- Focus on the underlying intent and context of each issue—such as reported symptoms, feature requests, reproduction steps, or error messages—rather than relying solely on string similarity or synthetic metrics.
|
- Prioritize concrete overlap in symptoms, reproduction details, error signatures, and user intent.
|
||||||
- After your analysis, validate your results in 1-2 lines explaining your decision to return the selected matches.
|
- Prefer active unresolved issues when confidence is similar.
|
||||||
- When unsure, prefer returning fewer matches.
|
- Closed issues can still be valid duplicates if they clearly match.
|
||||||
- Include at most five numbers.
|
- Return fewer matches rather than speculative ones.
|
||||||
|
- If confidence is low, return an empty list.
|
||||||
|
- Include at most five issue numbers.
|
||||||
|
- After analysis, provide a short reason for your decision.
|
||||||
|
|
||||||
output-schema: |
|
output-schema: |
|
||||||
{
|
{
|
||||||
@@ -77,6 +114,179 @@ jobs:
|
|||||||
"additionalProperties": false
|
"additionalProperties": false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
- id: normalize-all
|
||||||
|
name: Normalize pass 1 output
|
||||||
|
env:
|
||||||
|
CODEX_OUTPUT: ${{ steps.codex-all.outputs.final-message }}
|
||||||
|
CURRENT_ISSUE_NUMBER: ${{ github.event.issue.number }}
|
||||||
|
run: |
|
||||||
|
set -eo pipefail
|
||||||
|
|
||||||
|
raw=${CODEX_OUTPUT//$'\r'/}
|
||||||
|
parsed=false
|
||||||
|
issues='[]'
|
||||||
|
reason=''
|
||||||
|
|
||||||
|
if [ -n "$raw" ] && printf '%s' "$raw" | jq -e 'type == "object" and (.issues | type == "array")' >/dev/null 2>&1; then
|
||||||
|
parsed=true
|
||||||
|
issues=$(printf '%s' "$raw" | jq -c '[.issues[] | tostring]')
|
||||||
|
reason=$(printf '%s' "$raw" | jq -r '.reason // ""')
|
||||||
|
else
|
||||||
|
reason='Pass 1 output was empty or invalid JSON.'
|
||||||
|
fi
|
||||||
|
|
||||||
|
filtered=$(jq -cn --argjson issues "$issues" --arg current "$CURRENT_ISSUE_NUMBER" '[
|
||||||
|
$issues[]
|
||||||
|
| tostring
|
||||||
|
| select(. != $current)
|
||||||
|
] | reduce .[] as $issue ([]; if index($issue) then . else . + [$issue] end) | .[:5]')
|
||||||
|
|
||||||
|
has_matches=false
|
||||||
|
if [ "$(jq 'length' <<< "$filtered")" -gt 0 ]; then
|
||||||
|
has_matches=true
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Pass 1 parsed: $parsed"
|
||||||
|
echo "Pass 1 matches after filtering: $(jq 'length' <<< "$filtered")"
|
||||||
|
echo "Pass 1 reason: $reason"
|
||||||
|
|
||||||
|
{
|
||||||
|
echo "issues_json=$filtered"
|
||||||
|
echo "reason<<EOF"
|
||||||
|
echo "$reason"
|
||||||
|
echo "EOF"
|
||||||
|
echo "has_matches=$has_matches"
|
||||||
|
} >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- id: codex-open
|
||||||
|
name: Find duplicates (pass 2, open issues)
|
||||||
|
if: ${{ steps.normalize-all.outputs.has_matches != 'true' }}
|
||||||
|
uses: openai/codex-action@main
|
||||||
|
with:
|
||||||
|
openai-api-key: ${{ secrets.CODEX_OPENAI_API_KEY }}
|
||||||
|
allow-users: "*"
|
||||||
|
prompt: |
|
||||||
|
You are an assistant that triages new GitHub issues by identifying potential duplicates.
|
||||||
|
|
||||||
|
This is a fallback pass because a broad search did not find convincing matches.
|
||||||
|
|
||||||
|
You will receive the following JSON files located in the current working directory:
|
||||||
|
- `codex-current-issue.json`: JSON object describing the newly created issue (fields: number, title, body).
|
||||||
|
- `codex-existing-issues-open.json`: JSON array of open issues only.
|
||||||
|
|
||||||
|
Instructions:
|
||||||
|
- Search only these active unresolved issues for duplicates of the current issue.
|
||||||
|
- Prioritize concrete overlap in symptoms, reproduction details, error signatures, and user intent.
|
||||||
|
- Prefer fewer, higher-confidence matches.
|
||||||
|
- If confidence is low, return an empty list.
|
||||||
|
- Include at most five issue numbers.
|
||||||
|
- After analysis, provide a short reason for your decision.
|
||||||
|
|
||||||
|
output-schema: |
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"issues": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"reason": { "type": "string" }
|
||||||
|
},
|
||||||
|
"required": ["issues", "reason"],
|
||||||
|
"additionalProperties": false
|
||||||
|
}
|
||||||
|
|
||||||
|
- id: normalize-open
|
||||||
|
name: Normalize pass 2 output
|
||||||
|
if: ${{ steps.normalize-all.outputs.has_matches != 'true' }}
|
||||||
|
env:
|
||||||
|
CODEX_OUTPUT: ${{ steps.codex-open.outputs.final-message }}
|
||||||
|
CURRENT_ISSUE_NUMBER: ${{ github.event.issue.number }}
|
||||||
|
run: |
|
||||||
|
set -eo pipefail
|
||||||
|
|
||||||
|
raw=${CODEX_OUTPUT//$'\r'/}
|
||||||
|
parsed=false
|
||||||
|
issues='[]'
|
||||||
|
reason=''
|
||||||
|
|
||||||
|
if [ -n "$raw" ] && printf '%s' "$raw" | jq -e 'type == "object" and (.issues | type == "array")' >/dev/null 2>&1; then
|
||||||
|
parsed=true
|
||||||
|
issues=$(printf '%s' "$raw" | jq -c '[.issues[] | tostring]')
|
||||||
|
reason=$(printf '%s' "$raw" | jq -r '.reason // ""')
|
||||||
|
else
|
||||||
|
reason='Pass 2 output was empty or invalid JSON.'
|
||||||
|
fi
|
||||||
|
|
||||||
|
filtered=$(jq -cn --argjson issues "$issues" --arg current "$CURRENT_ISSUE_NUMBER" '[
|
||||||
|
$issues[]
|
||||||
|
| tostring
|
||||||
|
| select(. != $current)
|
||||||
|
] | reduce .[] as $issue ([]; if index($issue) then . else . + [$issue] end) | .[:5]')
|
||||||
|
|
||||||
|
has_matches=false
|
||||||
|
if [ "$(jq 'length' <<< "$filtered")" -gt 0 ]; then
|
||||||
|
has_matches=true
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Pass 2 parsed: $parsed"
|
||||||
|
echo "Pass 2 matches after filtering: $(jq 'length' <<< "$filtered")"
|
||||||
|
echo "Pass 2 reason: $reason"
|
||||||
|
|
||||||
|
{
|
||||||
|
echo "issues_json=$filtered"
|
||||||
|
echo "reason<<EOF"
|
||||||
|
echo "$reason"
|
||||||
|
echo "EOF"
|
||||||
|
echo "has_matches=$has_matches"
|
||||||
|
} >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- id: select-final
|
||||||
|
name: Select final duplicate set
|
||||||
|
env:
|
||||||
|
PASS1_ISSUES: ${{ steps.normalize-all.outputs.issues_json }}
|
||||||
|
PASS1_REASON: ${{ steps.normalize-all.outputs.reason }}
|
||||||
|
PASS2_ISSUES: ${{ steps.normalize-open.outputs.issues_json }}
|
||||||
|
PASS2_REASON: ${{ steps.normalize-open.outputs.reason }}
|
||||||
|
PASS1_HAS_MATCHES: ${{ steps.normalize-all.outputs.has_matches }}
|
||||||
|
PASS2_HAS_MATCHES: ${{ steps.normalize-open.outputs.has_matches }}
|
||||||
|
run: |
|
||||||
|
set -eo pipefail
|
||||||
|
|
||||||
|
selected_issues='[]'
|
||||||
|
selected_reason='No plausible duplicates found.'
|
||||||
|
selected_pass='none'
|
||||||
|
|
||||||
|
if [ "$PASS1_HAS_MATCHES" = "true" ]; then
|
||||||
|
selected_issues=${PASS1_ISSUES:-'[]'}
|
||||||
|
selected_reason=${PASS1_REASON:-'Pass 1 found duplicates.'}
|
||||||
|
selected_pass='all'
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$PASS2_HAS_MATCHES" = "true" ]; then
|
||||||
|
selected_issues=${PASS2_ISSUES:-'[]'}
|
||||||
|
selected_reason=${PASS2_REASON:-'Pass 2 found duplicates.'}
|
||||||
|
selected_pass='open-fallback'
|
||||||
|
fi
|
||||||
|
|
||||||
|
final_json=$(jq -cn \
|
||||||
|
--argjson issues "$selected_issues" \
|
||||||
|
--arg reason "$selected_reason" \
|
||||||
|
--arg pass "$selected_pass" \
|
||||||
|
'{issues: $issues, reason: $reason, pass: $pass}')
|
||||||
|
|
||||||
|
echo "Final pass used: $selected_pass"
|
||||||
|
echo "Final duplicate count: $(jq '.issues | length' <<< "$final_json")"
|
||||||
|
echo "Final reason: $(jq -r '.reason' <<< "$final_json")"
|
||||||
|
|
||||||
|
{
|
||||||
|
echo "codex_output<<EOF"
|
||||||
|
echo "$final_json"
|
||||||
|
echo "EOF"
|
||||||
|
} >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
comment-on-issue:
|
comment-on-issue:
|
||||||
name: Comment with potential duplicates
|
name: Comment with potential duplicates
|
||||||
needs: gather-duplicates
|
needs: gather-duplicates
|
||||||
@@ -105,11 +315,17 @@ jobs:
|
|||||||
|
|
||||||
const issues = Array.isArray(parsed?.issues) ? parsed.issues : [];
|
const issues = Array.isArray(parsed?.issues) ? parsed.issues : [];
|
||||||
const currentIssueNumber = String(context.payload.issue.number);
|
const currentIssueNumber = String(context.payload.issue.number);
|
||||||
|
const passUsed = typeof parsed?.pass === 'string' ? parsed.pass : 'unknown';
|
||||||
|
const reason = typeof parsed?.reason === 'string' ? parsed.reason : '';
|
||||||
|
|
||||||
console.log(`Current issue number: ${currentIssueNumber}`);
|
console.log(`Current issue number: ${currentIssueNumber}`);
|
||||||
|
console.log(`Pass used: ${passUsed}`);
|
||||||
|
if (reason) {
|
||||||
|
console.log(`Reason: ${reason}`);
|
||||||
|
}
|
||||||
console.log(issues);
|
console.log(issues);
|
||||||
|
|
||||||
const filteredIssues = issues.filter((value) => String(value) !== currentIssueNumber);
|
const filteredIssues = [...new Set(issues.map((value) => String(value)))].filter((value) => value !== currentIssueNumber).slice(0, 5);
|
||||||
|
|
||||||
if (filteredIssues.length === 0) {
|
if (filteredIssues.length === 0) {
|
||||||
core.info('Codex reported no potential duplicates.');
|
core.info('Codex reported no potential duplicates.');
|
||||||
|
|||||||
Reference in New Issue
Block a user