234 lines
6.3 KiB
JavaScript

const axios = require('axios');
const pdfParse = require('pdf-parse');
const db = require('../models/db');
const logger = require('../utils/logger');
// PDF documents to scrape. Each entry carries the download `url`, a
// human-readable `name` used in log messages, and the `city` the listing covers.
const PDF_SOURCES = [
  {
    url: 'https://www.edmontonsfoodbank.com/documents/293/2025_April_Free_Community_Meals_4cRPMU5.pdf',
    name: 'Edmonton Food Bank - Community Meals',
    city: 'Edmonton'
  }
];
/**
 * Download a PDF and return its raw bytes.
 *
 * The request is made through axios with a binary (`arraybuffer`) response
 * type, the bot's User-Agent header and a timeout value of 60000.
 *
 * @param {string} url - Location of the PDF to fetch.
 * @returns {Promise<Buffer|null>} The response body as a Buffer, or `null`
 *   when the request (or the Buffer conversion) fails; the failure is logged
 *   and never rethrown.
 */
async function downloadPDF(url) {
  const requestOptions = {
    responseType: 'arraybuffer',
    headers: {
      'User-Agent': 'FreeAlbertaFoodBot/1.0'
    },
    timeout: 60000
  };

  try {
    const { data } = await axios.get(url, requestOptions);
    return Buffer.from(data);
  } catch (err) {
    // Best-effort download: report the problem and let the caller skip this source.
    logger.error('Failed to download PDF', { url, error: err.message });
    return null;
  }
}
/**
 * Heuristically turn the extracted text of the Edmonton Food Bank community
 * meals PDF into a list of resource records.
 *
 * The text is split into trimmed, non-empty lines and each line is classified
 * in this order:
 *   1. Header/footer boilerplate is skipped.
 *   2. A line that does not start with a digit, mentions no weekday or
 *      clock time, and is 6-99 characters long starts a new resource and
 *      becomes its `name`.
 *   3. A line that looks like a street address sets (or overwrites) the
 *      current resource's `address`.
 *   4. A line mentioning a weekday or a clock time is appended to the current
 *      resource's `hours_of_operation`, joined with "; ".
 * Lines that match none of these, or that appear before the first name line,
 * are ignored.
 *
 * @param {string} text - Raw text extracted from the PDF.
 * @returns {Array<Object>} Records with `name`, `city`, `source` and, when
 *   found, `address` and `hours_of_operation`.
 */
function parseEdmontonMealsPDF(text) {
  const weekdayRe = /monday|tuesday|wednesday|thursday|friday|saturday|sunday/i;
  const clockTimeRe = /\d{1,2}:\d{2}\s*(am|pm|AM|PM)?/;
  const streetAddressRe = /\d+\s+[\w\s]+(?:Street|St|Avenue|Ave|Road|Rd|Drive|Dr|Boulevard|Blvd)/i;
  const pageMarkerRe = /page\s+\d+/i;
  const leadingDigitRe = /^\d/;

  // Document title, organisation banner and page numbers carry no listing data.
  const isBoilerplate = (entry) =>
    entry.includes('Free Community Meals') ||
    (entry.includes('Edmonton') && entry.includes('Food Bank')) ||
    pageMarkerRe.test(entry);

  const isSchedule = (entry) => weekdayRe.test(entry) || clockTimeRe.test(entry);

  // Location names are assumed to be short-ish text lines without a leading number.
  const isLocationName = (entry) =>
    !leadingDigitRe.test(entry) &&
    !isSchedule(entry) &&
    entry.length > 5 &&
    entry.length < 100;

  const entries = text
    .split('\n')
    .map((raw) => raw.trim())
    .filter((raw) => raw.length > 0);

  const found = [];
  let active = null;

  for (const entry of entries) {
    if (isBoilerplate(entry)) {
      continue;
    }

    if (isLocationName(entry)) {
      // A new name closes out whatever resource was being collected.
      if (active !== null) {
        found.push(active);
      }
      active = {
        name: entry,
        city: 'Edmonton',
        source: 'edmonton_foodbank_pdf'
      };
    } else if (active === null) {
      // Detail line with no resource to attach it to: drop it.
    } else if (streetAddressRe.test(entry)) {
      active.address = entry;
    } else if (isSchedule(entry)) {
      const previousHours = active.hours_of_operation;
      active.hours_of_operation = previousHours
        ? `${previousHours}; ${entry}`
        : entry;
    }
  }

  // The final resource has no following name line to flush it.
  if (active !== null) {
    found.push(active);
  }

  return found;
}
/**
 * Upsert one parsed community-meal record into `food_resources`.
 *
 * The row is keyed by (`source`, `source_id`); `source_id` is derived from the
 * resource name and city, with whitespace runs collapsed to "-" and the result
 * lower-cased. On conflict the name is replaced, address and hours are only
 * replaced when the new value is non-null, and `updated_at` is refreshed.
 *
 * @param {Object} resource - Parsed record (`name`, `city`, and optionally
 *   `description`, `address`, `phone`, `hours_of_operation`).
 * @param {string} sourceUrl - URL of the PDF the record came from.
 * @returns {Promise<Object|null>} The first returned row, which carries the
 *   query's `inserted` flag (`xmax = 0`), or `null` if the query failed; the
 *   failure is logged and not rethrown.
 */
async function saveResource(resource, sourceUrl) {
  const rawKey = `pdf-${resource.name}-${resource.city}`;
  const sourceId = rawKey.replace(/\s+/g, '-').toLowerCase();

  // Positional parameters $1..$8, in the order the statement expects them.
  const params = [
    resource.name,
    resource.description || 'Free community meal',
    resource.address || null,
    resource.city,
    resource.phone || null,
    resource.hours_of_operation || null,
    sourceUrl,
    sourceId
  ];

  const upsertSql = `
INSERT INTO food_resources (
name, description, resource_type,
address, city, phone, hours_of_operation,
source, source_url, source_id,
updated_at, last_verified_at
) VALUES ($1, $2, 'community_meal', $3, $4, $5, $6, 'edmonton_foodbank_pdf', $7, $8, NOW(), NOW())
ON CONFLICT (source, source_id)
DO UPDATE SET
name = EXCLUDED.name,
address = COALESCE(EXCLUDED.address, food_resources.address),
hours_of_operation = COALESCE(EXCLUDED.hours_of_operation, food_resources.hours_of_operation),
updated_at = NOW()
RETURNING (xmax = 0) AS inserted
`;

  try {
    const { rows } = await db.query(upsertSql, params);
    return rows[0];
  } catch (err) {
    // One bad record must not abort the whole import; report and move on.
    logger.error('Failed to save PDF resource', { name: resource.name, error: err.message });
    return null;
  }
}
/**
 * Run the full PDF import: download every entry in `PDF_SOURCES`, extract its
 * text, parse it into resources and upsert each one, while tracking the run in
 * the `scrape_logs` table.
 *
 * A `scrape_logs` row is inserted with status 'running' before any work starts
 * and is updated to 'completed' (with counters) or 'failed' (with the error
 * message) at the end. A source whose download or text extraction/parsing
 * fails is logged and skipped; it does not fail the run.
 *
 * @returns {Promise<{found: number, added: number, updated: number}>} Totals
 *   across all sources: resources parsed, rows newly inserted, rows updated.
 * @throws {Error} If creating the log row fails, or if an unexpected error
 *   escapes the processing loop / final log update. The original error is
 *   always the one rethrown, even if recording the failure also fails.
 */
async function parsePDFs() {
  logger.info('Starting PDF parsing');
  const logResult = await db.query(`
INSERT INTO scrape_logs (source, status)
VALUES ('edmonton_foodbank_pdf', 'running')
RETURNING id
`);
  const logId = logResult.rows[0].id;

  let totalFound = 0;
  let totalAdded = 0;
  let totalUpdated = 0;

  try {
    for (const pdfSource of PDF_SOURCES) {
      logger.info(`Processing PDF: ${pdfSource.name}`);
      const pdfBuffer = await downloadPDF(pdfSource.url);
      if (!pdfBuffer) {
        logger.warn(`Skipping ${pdfSource.name} - download failed`);
        continue;
      }

      try {
        const data = await pdfParse(pdfBuffer);
        logger.info(`Extracted ${data.text.length} characters from PDF`);
        const resources = parseEdmontonMealsPDF(data.text);
        logger.info(`Parsed ${resources.length} resources from PDF`);
        totalFound += resources.length;

        // Saved sequentially; a null result means the save failed and was
        // already logged, so it counts as neither added nor updated.
        for (const resource of resources) {
          const saved = await saveResource(resource, pdfSource.url);
          if (saved) {
            if (saved.inserted) {
              totalAdded++;
            } else {
              totalUpdated++;
            }
          }
        }
      } catch (parseError) {
        // One unreadable PDF should not stop the remaining sources.
        logger.error('PDF parsing failed', { error: parseError.message });
      }
    }

    await db.query(`
UPDATE scrape_logs
SET completed_at = NOW(),
status = 'completed',
records_found = $1,
records_added = $2,
records_updated = $3
WHERE id = $4
`, [totalFound, totalAdded, totalUpdated, logId]);

    logger.info('PDF parsing completed', {
      found: totalFound,
      added: totalAdded,
      updated: totalUpdated
    });
    return { found: totalFound, added: totalAdded, updated: totalUpdated };
  } catch (error) {
    // Recording the failure is best-effort: if this query also fails (e.g. the
    // database is what broke), its error must not replace the root cause.
    try {
      await db.query(`
UPDATE scrape_logs
SET completed_at = NOW(),
status = 'failed',
error_message = $1
WHERE id = $2
`, [error.message, logId]);
    } catch (logError) {
      logger.error('Failed to record PDF scrape failure', {
        logId,
        error: logError.message
      });
    }
    throw error;
  }
}
// Command-line entry point: when this file is executed directly (rather than
// required by another module), run the import once and exit with 0 on success
// or 1 on failure.
if (require.main === module) {
  const reportSuccess = (summary) => {
    console.log('PDF parsing completed:', summary);
    process.exit(0);
  };

  const reportFailure = (reason) => {
    console.error('PDF parsing failed:', reason);
    process.exit(1);
  };

  parsePDFs()
    .then(reportSuccess)
    .catch(reportFailure);
}

module.exports = { parsePDFs };