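/*
 * Scraper for community-meal listings that are published as PDF documents.
 * It downloads each configured PDF, extracts resource entries from the PDF
 * text, and upserts them into the food_resources table, recording each run
 * in scrape_logs.
 */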
const axios = require('axios');
|
|
const pdfParse = require('pdf-parse');
|
|
const db = require('../models/db');
|
|
const logger = require('../utils/logger');
|
|
|
|
/**
 * PDF documents this scraper pulls community-meal listings from.
 *
 * Each entry holds the download `url`, a human-readable `name` (used in log
 * messages while the source is processed) and the `city` label for the source.
 *
 * NOTE(review): the URL embeds an edition ("2025_April"), so it presumably has
 * to be updated when a newer PDF is published — confirm with the publisher.
 */
const PDF_SOURCES = [
  {
    url: 'https://www.edmontonsfoodbank.com/documents/293/2025_April_Free_Community_Meals_4cRPMU5.pdf',
    name: 'Edmonton Food Bank - Community Meals',
    city: 'Edmonton',
  },
];
|
|
|
|
/**
 * Fetch a PDF over HTTP and return its bytes.
 *
 * This is best-effort: any failure (network error, timeout, unusable response)
 * is logged and reported to the caller as `null` rather than thrown.
 *
 * @param {string} url - Location of the PDF to download.
 * @returns {Promise<Buffer|null>} The PDF contents, or `null` if the download failed.
 */
async function downloadPDF(url) {
  const requestOptions = {
    // Ask axios for raw bytes; the body is binary, not text.
    responseType: 'arraybuffer',
    headers: { 'User-Agent': 'FreeAlbertaFoodBot/1.0' },
    // axios timeouts are expressed in milliseconds.
    timeout: 60000,
  };

  try {
    const { data } = await axios.get(url, requestOptions);
    return Buffer.from(data);
  } catch (err) {
    logger.error('Failed to download PDF', { url, error: err.message });
    return null;
  }
}
|
|
|
|
/**
 * Extract community-meal entries from the text of the Edmonton Food Bank PDF.
 *
 * The text is processed line by line with heuristics:
 *   - header/footer lines (title, "Edmonton ... Food Bank", "Page N") are skipped;
 *   - a line that does not start with a digit, mentions no weekday or clock
 *     time, and is 6-99 characters long starts a new resource and becomes its
 *     name;
 *   - a line that looks like a street address is stored as the current
 *     resource's `address` (a later address line overwrites an earlier one);
 *   - a line that mentions a weekday or a clock time is appended to the
 *     current resource's `hours_of_operation`, joined with "; ".
 * Detail lines that appear before any name line are ignored.
 *
 * @param {string} text - Plain text extracted from the PDF.
 * @returns {Array<{name: string, city: string, source: string,
 *   address?: string, hours_of_operation?: string}>} Parsed resources in
 *   document order; empty when `text` is empty or not a string.
 */
function parseEdmontonMealsPDF(text) {
  // A PDF without a usable text layer should mean "no resources", not a
  // TypeError from calling .split() on a non-string.
  if (typeof text !== 'string' || text.length === 0) {
    return [];
  }

  const dayPattern = /monday|tuesday|wednesday|thursday|friday|saturday|sunday/i;
  // Only the HH:MM part matters for detection; an am/pm suffix is optional.
  const timePattern = /\d{1,2}:\d{2}/;
  // The \b anchors keep suffix abbreviations from matching inside ordinary
  // words ("St" in "breakfasts", "Dr" in "drop", "Ave" in "have"), which would
  // otherwise turn schedule or description lines into the address.
  const addressPattern =
    /\d+\s+[\w\s]+\b(?:Street|St|Avenue|Ave|Road|Rd|Drive|Dr|Boulevard|Blvd)\b/i;
  const pageFooterPattern = /page\s+\d+/i;

  const lines = text
    .split('\n')
    .map((rawLine) => rawLine.trim())
    .filter(Boolean);

  const resources = [];
  let currentResource = null;

  for (const line of lines) {
    const isHeaderOrFooter =
      line.includes('Free Community Meals') ||
      (line.includes('Edmonton') && line.includes('Food Bank')) ||
      pageFooterPattern.test(line);
    if (isHeaderOrFooter) {
      continue;
    }

    const mentionsSchedule = dayPattern.test(line) || timePattern.test(line);

    // Location names usually do not start with a number and carry no schedule.
    const looksLikeName =
      !/^\d/.test(line) &&
      !mentionsSchedule &&
      line.length > 5 &&
      line.length < 100;

    if (looksLikeName) {
      // A new name closes the entry that was being collected.
      if (currentResource) {
        resources.push(currentResource);
      }
      currentResource = {
        name: line,
        city: 'Edmonton',
        source: 'edmonton_foodbank_pdf',
      };
      continue;
    }

    // Address/schedule details are only meaningful once an entry is open.
    if (!currentResource) {
      continue;
    }

    if (addressPattern.test(line)) {
      currentResource.address = line;
      continue;
    }

    if (mentionsSchedule) {
      currentResource.hours_of_operation = currentResource.hours_of_operation
        ? `${currentResource.hours_of_operation}; ${line}`
        : line;
    }
  }

  // Flush the entry that was still open when the text ended.
  if (currentResource) {
    resources.push(currentResource);
  }

  return resources;
}
|
|
|
|
/**
 * Upsert one parsed resource into the `food_resources` table.
 *
 * Rows are keyed by `(source, source_id)`. The `source_id` is derived from the
 * resource name and city, so re-running the scraper updates the existing row
 * instead of inserting a duplicate. On conflict the name is replaced, while
 * address and hours are only replaced when the new value is non-null.
 *
 * Database errors are logged and reported as `null` rather than thrown.
 *
 * @param {{name: string, city: string, description?: string, address?: string,
 *   phone?: string, hours_of_operation?: string}} resource - Parsed entry.
 * @param {string} sourceUrl - URL of the PDF the entry came from.
 * @returns {Promise<{inserted: boolean}|null>} The row returned by the query
 *   (`inserted` comes from the `xmax = 0` check in RETURNING), or `null` when
 *   the query failed.
 */
async function saveResource(resource, sourceUrl) {
  // Stable identifier: "pdf-<name>-<city>", whitespace runs collapsed to "-", lower-cased.
  const rawKey = `pdf-${resource.name}-${resource.city}`;
  const sourceId = rawKey.replace(/\s+/g, '-').toLowerCase();

  const upsertSql = `
    INSERT INTO food_resources (
      name, description, resource_type,
      address, city, phone, hours_of_operation,
      source, source_url, source_id,
      updated_at, last_verified_at
    ) VALUES ($1, $2, 'community_meal', $3, $4, $5, $6, 'edmonton_foodbank_pdf', $7, $8, NOW(), NOW())
    ON CONFLICT (source, source_id)
    DO UPDATE SET
      name = EXCLUDED.name,
      address = COALESCE(EXCLUDED.address, food_resources.address),
      hours_of_operation = COALESCE(EXCLUDED.hours_of_operation, food_resources.hours_of_operation),
      updated_at = NOW()
    RETURNING (xmax = 0) AS inserted
  `;

  try {
    // Positional parameters $1..$8; missing optional fields are stored as NULL.
    const params = [
      resource.name,
      resource.description || 'Free community meal',
      resource.address || null,
      resource.city,
      resource.phone || null,
      resource.hours_of_operation || null,
      sourceUrl,
      sourceId,
    ];

    const { rows } = await db.query(upsertSql, params);
    return rows[0];
  } catch (err) {
    logger.error('Failed to save PDF resource', { name: resource.name, error: err.message });
    return null;
  }
}
|
|
|
|
/**
 * Run one full scrape: download every configured PDF, parse it, and upsert
 * the resulting resources, recording the run in `scrape_logs`.
 *
 * A `scrape_logs` row is created with status 'running' and later updated to
 * 'completed' (with counters) or 'failed' (with the error message).
 * A source whose download fails is skipped; a source whose PDF cannot be
 * parsed is logged and the loop moves on to the next source.
 *
 * @returns {Promise<{found: number, added: number, updated: number}>} Totals
 *   across all sources: resources parsed, rows inserted, rows updated.
 * @throws Rethrows the error that aborted the run (e.g. a failing
 *   `scrape_logs` query). A failure of the initial INSERT is thrown as-is.
 */
async function parsePDFs() {
  logger.info('Starting PDF parsing');

  const logResult = await db.query(`
    INSERT INTO scrape_logs (source, status)
    VALUES ('edmonton_foodbank_pdf', 'running')
    RETURNING id
  `);
  const logId = logResult.rows[0].id;

  let totalFound = 0;
  let totalAdded = 0;
  let totalUpdated = 0;

  try {
    for (const pdfSource of PDF_SOURCES) {
      logger.info(`Processing PDF: ${pdfSource.name}`);

      const pdfBuffer = await downloadPDF(pdfSource.url);
      if (!pdfBuffer) {
        logger.warn(`Skipping ${pdfSource.name} - download failed`);
        continue;
      }

      try {
        const data = await pdfParse(pdfBuffer);
        logger.info(`Extracted ${data.text.length} characters from PDF`);

        const resources = parseEdmontonMealsPDF(data.text);
        logger.info(`Parsed ${resources.length} resources from PDF`);

        totalFound += resources.length;

        // Saved one at a time; a resource that fails to save (null) is
        // counted as found but neither added nor updated.
        for (const resource of resources) {
          const saved = await saveResource(resource, pdfSource.url);
          if (!saved) {
            continue;
          }
          if (saved.inserted) {
            totalAdded++;
          } else {
            totalUpdated++;
          }
        }
      } catch (parseError) {
        // One unreadable PDF must not abort the remaining sources.
        logger.error('PDF parsing failed', {
          source: pdfSource.name,
          error: parseError.message
        });
      }
    }

    await db.query(`
      UPDATE scrape_logs
      SET completed_at = NOW(),
          status = 'completed',
          records_found = $1,
          records_added = $2,
          records_updated = $3
      WHERE id = $4
    `, [totalFound, totalAdded, totalUpdated, logId]);

    logger.info('PDF parsing completed', {
      found: totalFound,
      added: totalAdded,
      updated: totalUpdated
    });

    return { found: totalFound, added: totalAdded, updated: totalUpdated };
  } catch (error) {
    // Recording the failure is best-effort: if this query fails as well
    // (for instance because the database is what broke), its error must not
    // replace the original one that the caller needs to see.
    try {
      await db.query(`
        UPDATE scrape_logs
        SET completed_at = NOW(),
            status = 'failed',
            error_message = $1
        WHERE id = $2
      `, [error.message, logId]);
    } catch (logError) {
      logger.error('Failed to record PDF scrape failure', {
        logId,
        error: logError.message
      });
    }

    throw error;
  }
}
|
|
|
|
// CLI entry point: when this file is executed directly (rather than required
// by another module), run a single scrape and exit with 0 on success or 1 on
// failure.
if (require.main === module) {
  (async () => {
    try {
      const result = await parsePDFs();
      console.log('PDF parsing completed:', result);
      process.exit(0);
    } catch (err) {
      console.error('PDF parsing failed:', err);
      process.exit(1);
    }
  })();
}

// Public API of this module.
module.exports = { parsePDFs };
|