diff --git a/data/git-timestamps.json b/data/git-timestamps.json
index 0dfc2dbc9..fc854599b 100644
--- a/data/git-timestamps.json
+++ b/data/git-timestamps.json
@@ -2,6 +2,7 @@
   "src/content/about.md": "2026-01-10T13:09:46.000Z",
   "src/content/blog/antibot-at-scale.md": "2026-01-19T11:56:52.000Z",
   "src/content/blog/antibot-detection-at-scale.md": "2026-01-19T14:33:17.000Z",
+  "src/content/blog/automate-open-graph-audit.md": "2026-01-23T16:10:05.000Z",
   "src/content/blog/browser-automation.md": "2026-01-10T15:22:34.000Z",
   "src/content/blog/compress.md": "2026-01-10T15:22:34.000Z",
   "src/content/blog/custom-rules.md": "2026-01-10T15:22:34.000Z",
@@ -432,7 +433,7 @@
   "src/pages/styleguide.md": "2024-07-20T16:01:45.000Z",
   "src/pages/subprocessors.md": "2025-07-01T19:16:55.000Z",
   "src/pages/terms.js": "2018-08-20T15:55:21.000Z",
-  "src/pages/tools/sharing-debugger.js": "2026-01-21T18:20:39.000Z",
+  "src/pages/tools/sharing-debugger.js": "2026-01-22T13:37:15.000Z",
   "src/pages/tos.js": "2018-10-08T20:58:37.000Z",
   "src/pages/tos.md": "2022-11-27T09:53:11.000Z",
   "src/pages/user-agents.js": "2026-01-21T12:02:09.000Z"
diff --git a/src/content/blog/automate-open-graph-audit.md b/src/content/blog/automate-open-graph-audit.md
new file mode 100644
index 000000000..419a505d8
--- /dev/null
+++ b/src/content/blog/automate-open-graph-audit.md
@@ -0,0 +1,276 @@
+---
+title: 'Automate Open Graph Audit'
+subtitle: 'Stop Shipping Broken Link Previews'
+description: 'Learn how to build a Node.js script that automatically validates Open Graph tags across your entire sitemap.'
+date: '2026-01-23'
+---
+
+A naked URL without an Open Graph image and proper metadata is a **conversion leak**. It looks unprofessional, and it gets scrolled past. SEO can suffer too, since Google tends to reward pages whose metadata is well maintained.
+
+We built a [Sharing Debugger Tool](/tools/sharing-debugger) for exactly this: paste in any URL and instantly see how your metadata looks across different platforms. It's perfect for spot-checking individual pages.
+
+The problem is **scale**. You can't manually audit a sitemap with 5,000 pages using a browser extension. You need infrastructure that scales with your deployment.
+
+### The cost of broken links
+
+* **Visually Dominant:** Rich previews occupy 400% more pixels in a feed than plain text.
+* **Developer Trust:** If your meta tags are broken, maybe your API is too.
+* **CTR is King:** You can rank #1 on Google, but if your social sharing is broken, your viral coefficient is zero.
+
+For organizations with thousands of pages, content updates happen daily. When multiple people can modify pages, you need a robust solution that regularly re-scans all your sitemaps to catch regressions.
+
+In this post you'll learn how to use Microlink to automate your own scans and keep quality high. We're going to look at a simplified example that gives you a foundation to build on.
+
+### The simple stack
+
+I wanted this to be lightweight and practical, not some enterprise monstrosity. Three dependencies, that's it, and the quick sketch after this list shows how they fit together:
+
+* [**sitemapper**](https://www.npmjs.com/package/sitemapper): Grabs every URL from your sitemap (even handles nested sitemap indexes).
+* [**microlink/mql**](/docs/mql/getting-started/installation): Fetches metadata exactly like social networks see it.
+* [**p-map**](https://www.npmjs.com/package/p-map): Manages concurrency so you don't melt the free tier API.
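+
+Here's a minimal sketch of how the first two pieces fit together (the sitemap URL and the commented values are purely illustrative, and it uses top-level await, so run it as an ES module); `p-map` just controls how many of these lookups run at once. The full script in the next section layers validation, batching and reporting on top of this:
+
+```javascript
+import Sitemapper from 'sitemapper';
+import mql from '@microlink/mql';
+
+// Pull every URL out of the sitemap (illustrative URL).
+const sitemap = new Sitemapper({ url: 'https://example.com/sitemap.xml' });
+const { sites } = await sitemap.fetch();
+// sites -> ['https://example.com/', 'https://example.com/blog/hello-world', ...]
+
+// Fetch the normalized metadata for one page, the same way a social network would see it.
+const { status, data } = await mql(sites[0], { meta: true });
+// status -> 'success'
+// data.title, data.description, data.image, data.logo, data.author and data.date
+// are the fields the audit script below checks.
+```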
+
+### Getting started
+
+Five minutes of setup:
+
+```shell
+mkdir sitemap-validator
+cd sitemap-validator
+npm init -y
+npm install sitemapper @microlink/mql p-map --save
+```
+
+One heads-up: the script below uses ES module syntax, so add `"type": "module"` to the generated **package.json** (or name the file **audit.mjs**).
+
+### The script that does the heavy lifting
+
+Create an **audit.js** file and drop this in:
+
+```javascript
+import Sitemapper from 'sitemapper';
+import mql from '@microlink/mql';
+import pMap from 'p-map';
+import fs from 'fs';
+
+// CONFIGURATION
+const SITEMAP_URL = process.env.SITEMAP_URL || 'https://YOUR_WEB_PAGE.com/sitemap.xml';
+const API_KEY = process.env.API_KEY; // Optional, but recommended for large sitemaps
+
+const CONCURRENCY = 1; // Keep it low for the free tier
+const FREE_TIER_LIMIT = 50; // 50 requests per day on the free tier
+const FIRST_BATCH = 0; // Batch index: bump it to audit the next slice of URLs on the free tier
+
+const validateUrl = async (url) => {
+  await new Promise(resolve => setTimeout(resolve, 1000)); // Throttle to stay under free tier limits
+  try {
+    // We request 'meta' data specifically.
+    // Microlink mimics a browser to extract OG tags.
+    const { status, data, response } = await mql(url, {
+      meta: true,
+      apiKey: API_KEY
+    });
+
+    if (status !== 'success') {
+      return { url, valid: false, error: 'API Error', status };
+    }
+
+    const errors = [];
+
+    // Validation logic
+
+    if (!data.image || !data.image.url) errors.push('Missing OG Image');
+
+    if (!data.title) errors.push('Missing OG Title');
+    else if (data.title.length > 60) errors.push('OG Title is too long');
+
+    if (!data.description) errors.push('Missing Description');
+    else if (data.description.length < 50) errors.push('Description too short');
+
+    if (!data.author) errors.push('Missing Author');
+
+    if (!data.date) errors.push('Missing Date');
+    else if (isNaN(new Date(data.date).getTime())) errors.push('Invalid Date');
+
+    if (!data.logo) errors.push('Missing Logo');
+
+    return { valid: !errors.length, url, errors };
+
+  } catch (err) {
+    return { url, error: err.message, valid: false };
+  }
+};
+
+const runAudit = async () => {
+  console.log(`🗺️ Fetching sitemap: ${SITEMAP_URL}...`);
+
+  const sitemap = new Sitemapper({ url: SITEMAP_URL, timeout: 15000 });
+
+  let { sites } = await sitemap.fetch();
+  if (sites.length > FREE_TIER_LIMIT && !API_KEY) {
+    console.log(`Total URLs exceed the free tier limit. Cutting from ${sites.length} to ${FREE_TIER_LIMIT}`);
+    sites = sites.slice(FREE_TIER_LIMIT * FIRST_BATCH, FREE_TIER_LIMIT * (FIRST_BATCH + 1));
+  }
+
+  const totalUrls = sites.length;
+  let processedCount = 0;
+
+  console.log(`Found ${totalUrls} URLs. Starting validation...`);
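+
+  // NOTE: each validateUrl() call already sleeps for one second, so with
+  // CONCURRENCY = 1 you stay at roughly one request per second, which keeps
+  // the free tier happy. On a paid plan you can raise CONCURRENCY (and shorten
+  // or drop that delay) to get through large sitemaps much faster.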
+
+  // Use p-map to control concurrency with progress tracking
+  const results = await pMap(sites, async (url) => {
+    const result = await validateUrl(url);
+    processedCount++;
+    const percent = ((processedCount / totalUrls) * 100).toFixed(1);
+    process.stdout.write(`\r⏳ Progress: ${processedCount}/${totalUrls} (${percent}%)`);
+    return result;
+  }, { concurrency: CONCURRENCY });
+
+  // Reporting
+  const failures = results.filter(r => !r.valid);
+
+  console.log('\n\n--- AUDIT REPORT ---');
+  console.log(`Total Scanned: ${results.length}`);
+  console.log(`Passed: ${results.length - failures.length}`);
+  console.log(`Failed: ${failures.length}`);
+
+  if (failures.length > 0) {
+    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
+    const errorFilePath = `./errors-${timestamp}.txt`;
+
+    let errorContent = 'FAILED URLS:\n';
+    failures.forEach(f => {
+      errorContent += `\n${f.url}\n`;
+      if (f.errors) f.errors.forEach(e => errorContent += ` - ${e}\n`);
+      if (f.error) errorContent += ` - System Error: ${f.error}\n`;
+    });
+
+    fs.writeFileSync(errorFilePath, errorContent);
+    console.log(`\nErrors saved to: ${errorFilePath}`);
+  } else {
+    console.log('\nAll systems nominal. Your sitemap is perfect.');
+  }
+};
+
+runAudit();
+```
+
+### Running your first audit
+
+Just fire it up:
+
+```bash
+node audit.js
+```
+
+Set `SITEMAP_URL` (and optionally `API_KEY`) as environment variables, or simply edit the constants at the top of the script.
+
+**A quick heads-up on rate limits:** If you're on the free plan, keep `CONCURRENCY` at 1. You'll avoid those annoying 429 errors. With a [Pro plan](/#pricing), you can crank it to 10 or 20 and blast through thousands of pages in minutes.
+
+### What makes this actually work
+
+When you call `mql(url, { meta: true })`, we're not just parsing HTML. We spin up a [real headless Chrome browser](/blog/what-is-a-headless-browser).
+
+**Why does this matter?**
+
+Your React/Vue/Angular site renders properly: even if you're doing client-side rendering, we execute the JavaScript and grab the tags after they're populated.
+
+### Level up: semantic SEO with AI
+
+Since you're already fetching the page metadata, why not validate the quality of the content too?
+
+Once you have the audit running, you can add extra features as elaborate as you want. For example, you could hand the title, description and page content to an LLM and have it check (and suggest improvements to) both fields whenever it thinks that would help the page climb Google's rankings.
+
+```javascript
+import mql from '@microlink/mql';
+
+// ... prev code
+const { status, data, response } = await mql(url, {
+  meta: true,
+  data: {
+    content: {
+      selector: 'body', // get the body of the page
+      type: 'text' // get combined text content
+    }
+  }
+});
+
+console.log(`The content of the url -> ${data.content}`);
+// next code ...
+```
+
+You can extend and adapt this to your use case (or your clients') and offer a service that makes a difference.
+
+```
+You are an expert Technical SEO Auditor and Content Analyst. Your goal is to evaluate the semantic coherence between a webpage's metadata and its actual body content, adhering to Google's latest search documentation and best practices.
+
+**INPUT DATA:**
+1. `current_title`: The content of the