packages/wikipedia-diff-stream/main.js (93 lines of code) (raw):

#!/usr/bin/env node
// Copyright 2018 Twitter, Inc.
// Licensed under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0

// Streams recent changes for one Wikipedia domain from Wikimedia EventStreams
// and prints, as one JSON document per line on stdout, each change event
// augmented with the plain text that the edit added. Diagnostics go to stderr.

const EventSource = require("eventsource");
const request = require("request-promise-native");
const WtfWikipedia = require("wtf_wikipedia");
const diffMatchPatch = require("diff-match-patch-node")();

// Time budget for a single diff, in milliseconds.
const DIFF_TIMEOUT = 60 * 1000; // 60s timeout

// diff-match-patch operation code for inserted text.
const DIFF_INSERT = 1;

const url = "https://stream.wikimedia.org/v2/stream/recentchange";

/**
 * Returns the plain text present in `r2` but not in `r1`.
 *
 * Both arguments are wikitext; they are reduced to plain text before being
 * diffed, and every inserted fragment is trimmed and joined with newlines.
 *
 * @param {string} r1 - wikitext of the older revision
 * @param {string} r2 - wikitext of the newer revision
 * @returns {string} the inserted fragments, one per line ("" if none)
 */
function diffAdded(r1, r2) {
  const diffs = diffMatchPatch.diff_main(
    WtfWikipedia(r1).plaintext(),
    WtfWikipedia(r2).plaintext(),
    false, // no line-by-line optimization
    // NOTE(review): in Google's diff-match-patch this argument is an absolute
    // deadline (ms since epoch), not a duration; passing the bare duration
    // puts the deadline in 1970 and makes the diff give up immediately.
    // Assumes diff-match-patch-node keeps the upstream signature.
    Date.now() + DIFF_TIMEOUT
  );

  return diffs
    .filter(diff => diff[0] === DIFF_INSERT)
    .map(diff => diff[1].trim())
    .join("\n");
}

/** Prints command-line help to stderr and exits with status 1. */
function usage() {
  console.error("Usage: wikipedia-json <wiki domain>");
  console.error("Example: wikipedia-json en.wikipedia.org");
  process.exit(1);
}

/**
 * Entry point: validates the single command-line argument (a Wikipedia
 * domain) and starts streaming changes for it.
 */
function main() {
  if (process.argv.length !== 3) {
    usage();
  }

  const domain = process.argv[2];
  // Lower-case language code, optionally hyphenated (e.g. "en", "zh-yue").
  // The domain is interpolated into a request URL, so keep this strict.
  if (!/^[a-z]+(?:-[a-z]+)*\.wikipedia\.org$/.test(domain)) {
    usage();
  }

  startStream(domain);
}

/**
 * Fetches the main-slot wikitext of the given revisions from the MediaWiki
 * API of `domain`.
 *
 * @param {string} domain - wiki host name, e.g. "en.wikipedia.org"
 * @param {Array<number|string>} revids - revision ids to fetch
 * @returns {Promise<Array<string|undefined>>} contents in the same order as
 *   `revids`; an entry is undefined when the API did not return content for
 *   that id (missing id, unknown revision, hidden text)
 */
async function fetchChangeContent(domain, revids) {
  const wanted = revids.filter(id => id !== undefined && id !== null);
  if (wanted.length === 0) {
    return revids.map(() => undefined);
  }

  const body = await request.get(`https://${domain}/w/api.php`, {
    json: true,
    qs: {
      action: "query",
      prop: "revisions",
      rvslots: "*",
      // "ids" makes each revision carry its revid so the result can be
      // matched to the request instead of relying on response order.
      rvprop: "ids|content",
      format: "json",
      revids: wanted.join("|")
    }
  });

  const pages =
    body && body.query && body.query.pages
      ? Object.values(body.query.pages)
      : [];

  const contentById = new Map();
  for (const page of pages) {
    for (const rev of page.revisions || []) {
      const mainSlot = rev.slots && rev.slots.main;
      if (mainSlot && typeof mainSlot["*"] === "string") {
        contentById.set(String(rev.revid), mainSlot["*"]);
      }
    }
  }

  return revids.map(id => contentById.get(String(id)));
}

/**
 * Handles one EventStreams message: if it describes an edit on `domain`,
 * prints the event plus `content.added` (the text the edit inserted) as a
 * JSON line on stdout. Events that cannot be diffed or add nothing are
 * skipped silently.
 *
 * @param {string} domain - wiki host name the stream is filtered to
 * @param {{data: string}} event - message whose `data` is a JSON document
 * @returns {Promise<void>} rejects on malformed JSON or request failure
 */
async function processEvent(domain, event) {
  const data = JSON.parse(event.data);

  if (!data.meta || data.meta.domain !== domain || !data.revision) {
    return;
  }

  if (data.title.includes(":")) {
    // Ignore any changes to special pages (this is coarse but we're only
    // sampling)
    return;
  }

  const oldId = data.revision.old;
  const newId = data.revision.new;
  if (oldId == null || newId == null) {
    // Without both revisions (e.g. no previous revision on the event) there
    // is nothing to diff against.
    return;
  }

  const [r1, r2] = await fetchChangeContent(domain, [oldId, newId]);
  if (typeof r1 !== "string" || typeof r2 !== "string") {
    return;
  }

  const added = diffAdded(r1, r2);
  if (!added) {
    return;
  }

  console.log(
    JSON.stringify(
      Object.assign({}, data, {
        content: { added }
      })
    )
  );
}

/**
 * Connects to the EventStreams endpoint and processes every message for
 * `domain`. Connection status and errors are reported on stderr.
 *
 * @param {string} domain - wiki host name to keep events for
 */
function startStream(domain) {
  console.error(`Connecting to EventStreams at ${url}`);
  const eventSource = new EventSource(url);

  eventSource.onopen = function() {
    console.error("-- Opened connection.");
  };

  eventSource.onerror = function(err) {
    // The handler may receive a plain event object whose toString() is not
    // informative; prefer its message or status when present.
    const detail = (err && (err.message || err.status)) || String(err);
    console.error("-- Stream error: " + detail);
  };

  eventSource.onmessage = msg =>
    processEvent(domain, msg).catch(err => {
      console.error("-- Processing error: " + err.toString());
      console.error(err.stack);
    });
}

main();