const axios = require('axios');
const pdfParse = require('pdf-parse');
const db = require('../models/db');
const logger = require('../utils/logger');

/**
 * PDF documents ingested by this module. Every entry is downloaded and fed
 * through the Edmonton community-meals parser.
 */
const PDF_SOURCES = [
  {
    url: 'https://www.edmontonsfoodbank.com/documents/293/2025_April_Free_Community_Meals_4cRPMU5.pdf',
    name: 'Edmonton Food Bank - Community Meals',
    city: 'Edmonton'
  }
];

// Line-classification patterns used by parseEdmontonMealsPDF. None of them
// carries the `g` flag, so `.test()` keeps no state between calls.
const DAY_PATTERN = /monday|tuesday|wednesday|thursday|friday|saturday|sunday/i;
const TIME_PATTERN = /\d{1,2}:\d{2}\s*(am|pm|AM|PM)?/;
const ADDRESS_PATTERN = /\d+\s+[\w\s]+(?:Street|St|Avenue|Ave|Road|Rd|Drive|Dr|Boulevard|Blvd)/i;
const PAGE_MARKER_PATTERN = /page\s+\d+/i;
const LEADING_DIGIT_PATTERN = /^\d/;

/**
 * Download a PDF into memory.
 *
 * @param {string} url - Absolute URL of the PDF.
 * @returns {Promise<Buffer|null>} The response body, or `null` when the
 *   request fails (the failure is logged, never thrown).
 */
async function downloadPDF(url) {
  try {
    const response = await axios.get(url, {
      responseType: 'arraybuffer',
      headers: {
        'User-Agent': 'FreeAlbertaFoodBot/1.0'
      },
      timeout: 60000
    });
    return Buffer.from(response.data);
  } catch (error) {
    logger.error('Failed to download PDF', { url, error: error.message });
    return null;
  }
}

/**
 * Whether a line is page furniture (document title, organisation banner or
 * page number) rather than listing content.
 *
 * @param {string} line - Trimmed, non-empty line of PDF text.
 * @returns {boolean}
 */
function isHeaderOrFooterLine(line) {
  return (
    line.includes('Free Community Meals') ||
    (line.includes('Edmonton') && line.includes('Food Bank')) ||
    PAGE_MARKER_PATTERN.test(line)
  );
}

/**
 * Whether a line looks like the name that opens a new listing: it does not
 * start with a digit, mentions no weekday or clock time, and is between 6 and
 * 99 characters long.
 *
 * @param {string} line - Trimmed, non-empty line of PDF text.
 * @returns {boolean}
 */
function looksLikeLocationName(line) {
  return (
    !LEADING_DIGIT_PATTERN.test(line) &&
    !DAY_PATTERN.test(line) &&
    !TIME_PATTERN.test(line) &&
    line.length > 5 &&
    line.length < 100
  );
}

/**
 * Parse the text of the Edmonton Food Bank community meals PDF.
 *
 * The text is processed line by line. A name-like line opens a new resource;
 * subsequent address-like lines set its `address` (the last one wins) and
 * lines mentioning a weekday or a clock time are appended to
 * `hours_of_operation`, separated by "; ". Lines that fit none of these
 * categories, or that appear before the first name, are ignored.
 *
 * @param {string} text - Plain text extracted from the PDF.
 * @returns {Array<{name: string, city: string, source: string,
 *   address?: string, hours_of_operation?: string}>} Resources in document
 *   order.
 */
function parseEdmontonMealsPDF(text) {
  const resources = [];
  const lines = text
    .split('\n')
    .map((rawLine) => rawLine.trim())
    .filter((trimmedLine) => trimmedLine);

  let currentResource = null;

  for (const line of lines) {
    if (isHeaderOrFooterLine(line)) {
      continue;
    }

    if (looksLikeLocationName(line)) {
      // A new name closes the listing that was being accumulated.
      if (currentResource) {
        resources.push(currentResource);
      }
      currentResource = {
        name: line,
        city: 'Edmonton',
        source: 'edmonton_foodbank_pdf'
      };
      continue;
    }

    if (!currentResource) {
      // Detail lines ahead of the first name have nothing to attach to.
      continue;
    }

    if (ADDRESS_PATTERN.test(line)) {
      currentResource.address = line;
      continue;
    }

    if (DAY_PATTERN.test(line) || TIME_PATTERN.test(line)) {
      currentResource.hours_of_operation = currentResource.hours_of_operation
        ? `${currentResource.hours_of_operation}; ${line}`
        : line;
    }
  }

  // The final listing has no following name line to flush it.
  if (currentResource) {
    resources.push(currentResource);
  }

  return resources;
}

/**
 * Insert a parsed resource into `food_resources`, or update the existing row
 * that has the same `(source, source_id)`.
 *
 * The `source_id` is `pdf-<name>-<city>` with whitespace runs replaced by
 * hyphens and lowercased. On conflict, `address` and `hours_of_operation` are
 * only overwritten when the new value is non-null.
 *
 * @param {{name: string, city: string, description?: string, address?: string,
 *   phone?: string, hours_of_operation?: string}} resource - Parsed resource.
 * @param {string} sourceUrl - URL of the PDF the resource came from.
 * @returns {Promise<{inserted: boolean}|null>} The returned row, whose
 *   `inserted` flag separates new rows from updates, or `null` when the query
 *   fails (the failure is logged, never thrown).
 */
async function saveResource(resource, sourceUrl) {
  const sourceId = `pdf-${resource.name}-${resource.city}`
    .replace(/\s+/g, '-')
    .toLowerCase();

  try {
    // `xmax = 0` is what lets the caller count inserts and updates separately.
    const result = await db.query(`
      INSERT INTO food_resources (
        name, description, resource_type, address, city,
        phone, hours_of_operation,
        source, source_url, source_id,
        updated_at, last_verified_at
      ) VALUES ($1, $2, 'community_meal', $3, $4, $5, $6, 'edmonton_foodbank_pdf', $7, $8, NOW(), NOW())
      ON CONFLICT (source, source_id) DO UPDATE SET
        name = EXCLUDED.name,
        address = COALESCE(EXCLUDED.address, food_resources.address),
        hours_of_operation = COALESCE(EXCLUDED.hours_of_operation, food_resources.hours_of_operation),
        updated_at = NOW()
      RETURNING (xmax = 0) AS inserted
    `, [
      resource.name,
      resource.description || 'Free community meal',
      resource.address || null,
      resource.city,
      resource.phone || null,
      resource.hours_of_operation || null,
      sourceUrl,
      sourceId
    ]);

    return result.rows[0];
  } catch (error) {
    logger.error('Failed to save PDF resource', { name: resource.name, error: error.message });
    return null;
  }
}

/**
 * Mark a `scrape_logs` row as failed and store the reason.
 *
 * @param {*} logId - Primary key of the `scrape_logs` row.
 * @param {string} message - Text stored in `error_message`.
 * @returns {Promise<void>}
 * @throws Whatever `db.query` rejects with.
 */
async function recordRunFailure(logId, message) {
  await db.query(`
    UPDATE scrape_logs SET
      completed_at = NOW(),
      status = 'failed',
      error_message = $1
    WHERE id = $2
  `, [message, logId]);
}

/**
 * Download every configured PDF, parse it, upsert the resulting resources and
 * record the run in `scrape_logs`.
 *
 * A source whose download or parse fails is logged and skipped. If every
 * configured source fails that way, the run is recorded as 'failed' (with the
 * collected reasons) instead of 'completed'; the counters are returned either
 * way.
 *
 * @returns {Promise<{found: number, added: number, updated: number}>} Totals
 *   across all sources.
 * @throws Rethrows any unexpected error (for example a failing `scrape_logs`
 *   query) after attempting to mark the run as failed.
 */
async function parsePDFs() {
  logger.info('Starting PDF parsing');

  const logResult = await db.query(`
    INSERT INTO scrape_logs (source, status)
    VALUES ('edmonton_foodbank_pdf', 'running')
    RETURNING id
  `);
  const logId = logResult.rows[0].id;

  let totalFound = 0;
  let totalAdded = 0;
  let totalUpdated = 0;
  // One entry per source that could not be downloaded or parsed.
  const failures = [];

  try {
    for (const pdfSource of PDF_SOURCES) {
      logger.info(`Processing PDF: ${pdfSource.name}`);

      const pdfBuffer = await downloadPDF(pdfSource.url);
      if (!pdfBuffer) {
        logger.warn(`Skipping ${pdfSource.name} - download failed`);
        failures.push(`${pdfSource.name}: download failed`);
        continue;
      }

      try {
        const data = await pdfParse(pdfBuffer);
        logger.info(`Extracted ${data.text.length} characters from PDF`);

        const resources = parseEdmontonMealsPDF(data.text);
        logger.info(`Parsed ${resources.length} resources from PDF`);
        totalFound += resources.length;

        for (const resource of resources) {
          const saved = await saveResource(resource, pdfSource.url);
          if (!saved) {
            continue;
          }
          if (saved.inserted) {
            totalAdded++;
          } else {
            totalUpdated++;
          }
        }
      } catch (parseError) {
        logger.error('PDF parsing failed', { error: parseError.message });
        failures.push(`${pdfSource.name}: ${parseError.message}`);
      }
    }

    const totals = { found: totalFound, added: totalAdded, updated: totalUpdated };

    // A run in which no source could be processed must not look successful
    // in scrape_logs.
    const allSourcesFailed = failures.length > 0 && failures.length === PDF_SOURCES.length;
    if (allSourcesFailed) {
      await recordRunFailure(logId, failures.join('; '));
      logger.error('PDF parsing failed for all sources', { failures });
      return totals;
    }

    await db.query(`
      UPDATE scrape_logs SET
        completed_at = NOW(),
        status = 'completed',
        records_found = $1,
        records_added = $2,
        records_updated = $3
      WHERE id = $4
    `, [totalFound, totalAdded, totalUpdated, logId]);

    logger.info('PDF parsing completed', totals);

    return totals;
  } catch (error) {
    // Best-effort bookkeeping: if this write fails too (e.g. the database is
    // what broke), the caller must still receive the error that caused the
    // run to abort, not the secondary one.
    try {
      await recordRunFailure(logId, error.message);
    } catch (logError) {
      logger.error('Failed to record scrape failure', { logId, error: logError.message });
    }
    throw error;
  }
}

// Run if called directly
if (require.main === module) {
  parsePDFs()
    .then((result) => {
      console.log('PDF parsing completed:', result);
      process.exit(0);
    })
    .catch((err) => {
      console.error('PDF parsing failed:', err);
      process.exit(1);
    });
}

module.exports = { parsePDFs };