Step 1: GitHub Webhook & PR Parsing
Set up a webhook server to receive GitHub pull request events, parse unified diffs to extract changed code, and filter files so the LLM only reviews what matters.
How GitHub Webhooks Work
When you configure a webhook on a GitHub repository, GitHub sends an HTTP POST request to your server every time a specified event occurs. For our code review tool, we care about the pull_request event, specifically when a PR is opened, synchronized (new commits pushed), or reopened.
The webhook payload contains metadata about the PR (title, author, base branch, head branch) but not the actual diff. We need a second API call to fetch the diff content.
Setting Up the Webhook Handler
Update src/server.js to handle pull request events properly:
// src/server.js - Updated webhook handler
require('dotenv').config();
const express = require('express');
const crypto = require('crypto');
const { handlePullRequest } = require('./github');
// Express application and the port it binds to (PORT env var, else 3000).
const app = express();
const PORT = process.env.PORT || 3000;

// IMPORTANT: signature verification hashes the exact bytes GitHub sent, so the
// JSON parser's verify hook stashes the unparsed buffer on the request.
app.use(
  express.json({
    verify(req, _res, buf) {
      req.rawBody = buf;
    },
  })
);
/**
 * Verify the GitHub webhook signature carried in `x-hub-signature-256`.
 *
 * Computes an HMAC-SHA256 of the raw request body keyed with
 * GITHUB_WEBHOOK_SECRET and compares it with the header in constant time.
 *
 * @param {object} req - Request with `headers` and the `rawBody` buffer
 *   captured by the JSON body parser's verify hook.
 * @returns {boolean} true only when the signature matches; false when the
 *   header, the secret, or the raw body is missing, or on any mismatch.
 */
function verifySignature(req) {
  const signature = req.headers['x-hub-signature-256'];
  const secret = process.env.GITHUB_WEBHOOK_SECRET;

  // rawBody is only set when the JSON parser actually ran; without it (or
  // without a secret) there is nothing to verify against, so reject instead
  // of letting createHmac()/update() throw.
  if (typeof signature !== 'string' || !signature || !secret || !req.rawBody) {
    return false;
  }

  const hmac = crypto.createHmac('sha256', secret);
  const digest = `sha256=${hmac.update(req.rawBody).digest('hex')}`;

  const received = Buffer.from(signature);
  const expected = Buffer.from(digest);

  // timingSafeEqual throws on buffers of different length, so rule that out first.
  if (received.length !== expected.length) {
    return false;
  }
  return crypto.timingSafeEqual(received, expected);
}
// Health check: replies with a fixed status and the current time in ISO format.
app.get('/health', (_req, res) => {
  const payload = {
    status: 'ok',
    timestamp: new Date().toISOString(),
  };
  res.json(payload);
});
// GitHub webhook endpoint.
//
// Responses:
//   401 - a webhook secret is configured and the signature does not verify
//   200 - event or action is not one we review ({ ignored: true, ... })
//   400 - pull_request payload is missing the fields the review needs
//   202 - accepted; the review then runs after the response has been sent
app.post('/webhook', async (req, res) => {
  // 1. Verify the webhook signature (only enforced when a secret is configured)
  if (process.env.GITHUB_WEBHOOK_SECRET && !verifySignature(req)) {
    console.error('Invalid webhook signature');
    return res.status(401).json({ error: 'Invalid signature' });
  }

  // 2. Check the event type
  const event = req.headers['x-github-event'];
  if (event !== 'pull_request') {
    return res.status(200).json({ ignored: true, event });
  }

  // 3. Only process relevant actions
  const { action, pull_request: pullRequest, repository } = req.body || {};
  const validActions = ['opened', 'synchronize', 'reopened'];
  if (!validActions.includes(action)) {
    return res.status(200).json({ ignored: true, action });
  }

  // 4. Validate the fields we are about to use BEFORE acknowledging the
  //    delivery. Otherwise a malformed payload would throw after the 202 was
  //    sent, and again inside the catch block, leaving an unhandled rejection.
  const owner = repository && repository.owner && repository.owner.login;
  const repo = repository && repository.name;
  const pullNumber = pullRequest && pullRequest.number;
  const headSha = pullRequest && pullRequest.head && pullRequest.head.sha;
  if (!owner || !repo || !Number.isInteger(pullNumber) || !headSha) {
    console.error('Malformed pull_request payload');
    return res.status(400).json({ error: 'Malformed pull_request payload' });
  }

  // 5. Respond immediately (GitHub times out after 10 seconds)
  res.status(202).json({ processing: true });

  // 6. Process the PR asynchronously; errors are logged because the response
  //    has already been sent and can no longer carry a failure status.
  try {
    console.log(`Processing PR #${pullNumber}: ${pullRequest.title}`);
    await handlePullRequest({ owner, repo, pullNumber, headSha });
    console.log(`Finished reviewing PR #${pullNumber}`);
  } catch (error) {
    console.error(`Error reviewing PR #${pullNumber}:`, error);
  }
});
// Start the HTTP server and log the port once it is listening.
app.listen(PORT, function onListening() {
  console.log(`AI Code Reviewer listening on port ${PORT}`);
});
Respond with 202 Accepted immediately, then process asynchronously. GitHub terminates the connection and marks the delivery as failed if your server does not respond within 10 seconds, and a failed delivery that is later redelivered could cause duplicate reviews.
Building the GitHub API Client
Create src/github.js to handle all GitHub API interactions:
// src/github.js
const { Octokit } = require('@octokit/rest');
const { parseDiff } = require('./diff-parser');
const { analyzeCode } = require('./analyzer');
const { postReviewComments } = require('./commenter');
// Shared GitHub API client, authenticated with the GITHUB_TOKEN environment variable.
const octokit = new Octokit({ auth: process.env.GITHUB_TOKEN });
/**
 * Run the review pipeline for a single pull request: fetch its diff, parse it,
 * drop files that should not be reviewed, analyze the rest, and post any
 * resulting issues back to the PR.
 *
 * @param {object} params
 * @param {string} params.owner - Repository owner login.
 * @param {string} params.repo - Repository name.
 * @param {number} params.pullNumber - Pull request number.
 * @param {string} params.headSha - SHA of the PR head commit, forwarded to the commenter.
 * @returns {Promise<void>}
 */
async function handlePullRequest({ owner, repo, pullNumber, headSha }) {
  // Steps 1-3: fetch the diff, parse it into file changes, keep reviewable files
  const rawDiff = await fetchPRDiff(owner, repo, pullNumber);
  const parsedFiles = parseDiff(rawDiff);
  const candidates = filterFiles(parsedFiles);

  const candidateCount = candidates.length;
  if (candidateCount === 0) {
    console.log('No reviewable files found in this PR');
    return;
  }
  console.log(`Found ${candidateCount} reviewable files`);

  // Step 4: Analyze with LLM (Lesson 3)
  const issues = await analyzeCode(candidates);

  // Step 5: Post comments (Lesson 4) - only when there is something to report
  const hasIssues = issues.length > 0;
  if (hasIssues) {
    await postReviewComments({ owner, repo, pullNumber, headSha, issues });
  }
}
/**
 * Fetch a pull request with the `diff` media type and return the response data.
 *
 * @param {string} owner - Repository owner login.
 * @param {string} repo - Repository name.
 * @param {number} pullNumber - Pull request number.
 * @returns {Promise<*>} The `data` field of the Octokit response.
 */
async function fetchPRDiff(owner, repo, pullNumber) {
  const request = {
    owner,
    repo,
    pull_number: pullNumber,
    // Ask for the diff representation instead of the default JSON.
    mediaType: { format: 'diff' },
  };
  const { data } = await octokit.pulls.get(request);
  return data;
}
/**
 * List the files changed in a pull request.
 *
 * A single listFiles call returns at most `per_page` (100) entries, so larger
 * PRs would be silently truncated. `octokit.paginate` follows the pagination
 * links and concatenates every page into one array.
 *
 * @param {string} owner - Repository owner login.
 * @param {string} repo - Repository name.
 * @param {number} pullNumber - Pull request number.
 * @returns {Promise<Array<object>>} File entries from all pages, in API order.
 */
async function fetchPRFiles(owner, repo, pullNumber) {
  return octokit.paginate(octokit.pulls.listFiles, {
    owner,
    repo,
    pull_number: pullNumber,
    per_page: 100,
  });
}
// File extensions we want to review
const REVIEWABLE_EXTENSIONS = [
  '.js', '.ts', '.jsx', '.tsx',
  '.py', '.rb', '.go', '.rs',
  '.java', '.kt', '.cs', '.cpp',
  '.c', '.h', '.swift', '.php',
];

// Files and directories to skip. Directory patterns are anchored to the start
// of a path segment so that e.g. "rebuild/" or "myvendor/" are not skipped by
// accident just because they end in a skipped directory name.
const SKIP_PATTERNS = [
  /(^|\/)node_modules\//,
  /(^|\/)vendor\//,
  /(^|\/)dist\//,
  /(^|\/)build\//,
  /\.min\./,
  /package-lock\.json/,
  /yarn\.lock/,
  /\.generated\./,
];

/**
 * Keep only the files worth sending for review.
 *
 * A file is kept when it is not deleted, its basename has one of the
 * REVIEWABLE_EXTENSIONS, and its path matches none of the SKIP_PATTERNS.
 *
 * @param {Array<{filename: string, status: string}>} files - Parsed file changes.
 * @returns {Array<object>} The subset of `files` to review, in the original order.
 */
function filterFiles(files) {
  return files.filter((file) => {
    // Skip deleted files (nothing to review)
    if (file.status === 'removed') {
      return false;
    }

    // Take the extension from the basename only. A file with no dot at all
    // (e.g. a top-level script named "go") has no extension and must not be
    // mistaken for a ".go" source file.
    const baseName = file.filename.slice(file.filename.lastIndexOf('/') + 1);
    const dotIndex = baseName.lastIndexOf('.');
    if (dotIndex === -1) {
      return false;
    }
    const ext = baseName.slice(dotIndex);
    if (!REVIEWABLE_EXTENSIONS.includes(ext)) {
      return false;
    }

    // Check skip patterns against the full path
    return !SKIP_PATTERNS.some((pattern) => pattern.test(file.filename));
  });
}
// Public API of src/github.js: the PR pipeline entry point, the fetch and
// filter helpers, and the shared Octokit client instance.
module.exports = {
handlePullRequest,
fetchPRDiff,
fetchPRFiles,
filterFiles,
octokit,
};
Building the Diff Parser
GitHub returns diffs in unified diff format. We need to parse this into structured data the LLM can understand. Create src/diff-parser.js:
// src/diff-parser.js
/**
 * Turn a unified diff string into an array of per-file objects.
 *
 * The input is cut at every line that begins with "diff --git "; each piece is
 * handed to parseFileSection, and pieces it cannot parse are dropped.
 *
 * Each returned file object has:
 *   - filename: path of the changed file
 *   - status: 'added', 'modified', or 'removed'
 *   - hunks: array of hunks, each with
 *       - header: the @@ line
 *       - startLine: starting line number in the new file
 *       - context: text following the closing @@ of the header
 *       - lines: array of { type, lineNumber, content, diffPosition }
 *
 * @param {string} diffString - Unified diff text.
 * @returns {Array<object>} Parsed files in diff order.
 */
function parseDiff(diffString) {
  return diffString
    .split(/^diff --git /m)
    .filter(Boolean)
    .map((section) => parseFileSection(section))
    .filter(Boolean);
}
/**
 * Parse one per-file section of a unified diff (the text that follows a
 * "diff --git " marker) into a structured object.
 *
 * @param {string} section - Section text whose first line looks like
 *   "a/<old path> b/<new path>".
 * @returns {{filename: string, status: string, hunks: Array<object>}|null}
 *   The parsed file, or null when the first line carries no a/... b/... pair.
 *   `filename` is the b/ (new) path; `status` is 'added', 'removed' or
 *   'modified'.
 */
function parseFileSection(section) {
  // Hunk header: @@ -oldStart,oldCount +newStart,newCount @@ optional context
  const HUNK_HEADER = /^@@ -(\d+)(?:,\d+)? \+(\d+)(?:,\d+)? @@(.*)/;

  const lines = section.split('\n');

  // Extract filename from the diff header
  const headerMatch = lines[0].match(/a\/(.*?) b\/(.*)/);
  if (!headerMatch) return null;
  const filename = headerMatch[2];

  // Determine file status from the header lines only (everything before the
  // first hunk). Searching the whole section would misclassify a modified file
  // whose changed content merely contains the text "new file mode" or
  // "deleted file mode".
  let status = 'modified';
  for (const line of lines) {
    if (HUNK_HEADER.test(line)) break;
    if (line.startsWith('new file mode')) {
      status = 'added';
      break;
    }
    if (line.startsWith('deleted file mode')) {
      status = 'removed';
      break;
    }
  }

  // Parse hunks
  const hunks = [];
  let currentHunk = null;
  let newLineNumber = 0;
  // 1-based index of the current line within this section, header lines included.
  // NOTE(review): GitHub's review-comment `position` counts from the first @@
  // line instead - confirm which convention the consumer of diffPosition expects.
  let diffLineIndex = 0;

  for (const line of lines) {
    diffLineIndex++;

    const hunkMatch = line.match(HUNK_HEADER);
    if (hunkMatch) {
      const startLine = parseInt(hunkMatch[2], 10);
      currentHunk = {
        header: line,
        startLine,
        context: hunkMatch[3].trim(),
        lines: [],
      };
      hunks.push(currentHunk);
      newLineNumber = startLine;
      continue;
    }

    // Header lines before the first hunk carry no line data.
    if (!currentHunk) continue;

    // Parse diff lines; anything without a '+', '-' or ' ' marker
    // (e.g. "\ No newline at end of file") is ignored.
    if (line.startsWith('+')) {
      currentHunk.lines.push({
        type: 'added',
        lineNumber: newLineNumber,
        content: line.substring(1),
        diffPosition: diffLineIndex,
      });
      newLineNumber++;
    } else if (line.startsWith('-')) {
      // Removed lines do not exist in the new file, so they have no line number.
      currentHunk.lines.push({
        type: 'removed',
        lineNumber: null,
        content: line.substring(1),
        diffPosition: diffLineIndex,
      });
    } else if (line.startsWith(' ')) {
      currentHunk.lines.push({
        type: 'context',
        lineNumber: newLineNumber,
        content: line.substring(1),
        diffPosition: diffLineIndex,
      });
      newLineNumber++;
    }
  }

  return { filename, status, hunks };
}
/**
 * Render a parsed file back into annotated diff text for an LLM prompt.
 *
 * Output starts with a "File: <name> (<status>)" line; each hunk follows as a
 * blank line, its @@ header, then one line per diff line made of the +/-/space
 * marker, a space, an "L<n>: " label when the line has a line number, and the
 * line content.
 *
 * @param {{filename: string, status: string, hunks: Array<object>}} file
 * @returns {string} The rendered text, ending with a newline.
 */
function hunksToString(file) {
  const pieces = [`File: ${file.filename} (${file.status})\n`];

  file.hunks.forEach((hunk) => {
    pieces.push(`\n${hunk.header}\n`);

    hunk.lines.forEach((entry) => {
      let marker = ' ';
      if (entry.type === 'added') {
        marker = '+';
      } else if (entry.type === 'removed') {
        marker = '-';
      }

      // Lines without a number (removed lines) get a single space instead of a label.
      let label = ' ';
      if (entry.lineNumber) {
        label = `L${entry.lineNumber}: `;
      }

      pieces.push(`${marker} ${label}${entry.content}\n`);
    });
  });

  return pieces.join('');
}
// Public API of src/diff-parser.js; parseFileSection stays module-private.
module.exports = { parseDiff, hunksToString };
Testing the Diff Parser
Let us write a quick test to verify our parser works correctly. Create test-parser.js in the project root:
// test-parser.js
// Manual smoke test: parses a small two-hunk diff and prints what the parser found.
const { parseDiff, hunksToString } = require('./src/diff-parser');

// Sample unified diff, built line by line so the one-character markers are
// explicit: a leading ' ' is a context line, '+' an added line. The parser
// ignores hunk lines that have no marker, so context lines MUST keep their
// leading space or they are dropped and the line numbers come out wrong.
const sampleDiff = [
  'diff --git a/src/utils.js b/src/utils.js',
  'index abc1234..def5678 100644',
  '--- a/src/utils.js',
  '+++ b/src/utils.js',
  '@@ -10,6 +10,8 @@ function calculateTotal(items) {',
  '   let total = 0;',
  '   for (const item of items) {',
  '     total += item.price;',
  '+    // BUG: no null check on item.price',
  '+    if (item.discount) total -= item.discount;',
  '   }',
  '   return total;',
  ' }',
  '@@ -25,3 +27,7 @@ function formatCurrency(amount) {',
  "   return '$' + amount.toFixed(2);",
  ' }',
  // Blank context line: just the ' ' marker (third old line of this hunk).
  ' ',
  '+',
  '+function validateEmail(email) {',
  "+  return email.includes('@');",
  '+}',
  '',
].join('\n');

const files = parseDiff(sampleDiff);
console.log('Parsed files:', files.length);

for (const file of files) {
  console.log(`\nFile: ${file.filename} (${file.status})`);
  console.log(`Hunks: ${file.hunks.length}`);
  console.log(hunksToString(file));
}
Run it:
node test-parser.js
You should see the parsed output showing two hunks with properly identified added and context lines (this sample contains no removed lines), each with correct line numbers.
Exposing Your Local Server with ngrok
During development, GitHub needs to reach your local server. Use ngrok to create a public tunnel:
# Install ngrok globally via npm (skip if you have already installed it)
npm install -g ngrok
# Start your server (it listens on PORT, 3000 by default)
node src/server.js
# In another terminal, create a public tunnel to local port 3000
ngrok http 3000
ngrok will give you a public URL like https://abc123.ngrok-free.app. Use this as your webhook URL in the next step.
Configuring the GitHub Webhook
Now connect GitHub to your server:
- Go to your test repository on GitHub
- Navigate to Settings → Webhooks → Add webhook
- Set the Payload URL to https://your-ngrok-url/webhook
- Set Content type to application/json
- Set Secret to the same value as GITHUB_WEBHOOK_SECRET in your .env
- Under "Which events would you like to trigger this webhook?", select Let me select individual events and check only Pull requests
- Click Add webhook
Complete Webhook Flow
Here is the full flow we have built in this lesson:
1. Developer opens/updates a PR on GitHub
2. GitHub sends POST /webhook with PR metadata
3. Our server verifies the signature
4. We respond 202 immediately (avoid timeout)
5. We fetch the full diff via GitHub API
6. We parse the diff into structured file objects
7. We filter out non-reviewable files
8. Ready for LLM analysis (next lesson)
What Is Next
The webhook pipeline is ready and the diff parser is working. In the next lesson, we will build Step 2: Code Analysis with LLM — designing prompts that turn an LLM into an effective code reviewer, handling large diffs, and parsing structured output.
Lilly Tech Systems