// freealberta/freealberta-food/app/scrapers/pdf-parser.js

const axios = require('axios');
const pdfParse = require('pdf-parse');
const db = require('../models/db');
const logger = require('../utils/logger');

/**
 * PDF documents handled by this module.
 *
 * Each entry carries a human-readable `name`, the `city` it covers and the
 * `url` the document is fetched from.
 */
const PDF_SOURCES = [
  {
    name: 'Edmonton Food Bank - Community Meals',
    city: 'Edmonton',
    url: 'https://www.edmontonsfoodbank.com/documents/293/2025_April_Free_Community_Meals_4cRPMU5.pdf',
  },
];

/**
 * Fetch a PDF over HTTP and return its bytes.
 *
 * @param {string} url - Location of the PDF to download.
 * @returns {Promise<Buffer|null>} The response body as a Buffer, or `null`
 *   when the request (or the Buffer conversion) fails; the failure is logged.
 */
async function downloadPDF(url) {
  const requestConfig = {
    responseType: 'arraybuffer',
    timeout: 60000,
    headers: { 'User-Agent': 'FreeAlbertaFoodBot/1.0' }
  };

  try {
    const { data } = await axios.get(url, requestConfig);
    return Buffer.from(data);
  } catch (err) {
    // Best effort: report the problem and let the caller decide what to do.
    logger.error('Failed to download PDF', { url, error: err.message });
    return null;
  }
}

/**
 * Parse the text of the Edmonton Food Bank community meals PDF.
 *
 * The text is processed line by line with heuristics:
 * - header/footer lines are ignored;
 * - a line that consists only of a phone number (optionally labelled
 *   "Phone:", "Tel:", ...) is stored as the current entry's `phone`;
 * - a line that does not start with a digit, contains no weekday and no
 *   clock time, and is 6-99 characters long starts a new entry (its name);
 * - a line matching a street address is stored as `address`;
 * - lines containing a weekday or a clock time are accumulated, separated
 *   by "; ", into `hours_of_operation`.
 *
 * @param {string} text - Raw text extracted from the PDF.
 * @returns {Array<Object>} Parsed entries. Each has `name`, `city`
 *   ('Edmonton') and `source` ('edmonton_foodbank_pdf'), plus `address`,
 *   `phone` and `hours_of_operation` when such lines were found.
 */
function parseEdmontonMealsPDF(text) {
  const resources = [];
  const lines = text.split('\n').map(l => l.trim()).filter(l => l);

  // Common patterns in the PDF
  const dayPatterns = /monday|tuesday|wednesday|thursday|friday|saturday|sunday/i;
  const timePattern = /\d{1,2}:\d{2}\s*(am|pm|AM|PM)?/;
  const addressPattern = /\d+\s+[\w\s]+(?:Street|St|Avenue|Ave|Road|Rd|Drive|Dr|Boulevard|Blvd)/i;
  // Anchored to the whole line so that names or addresses that merely
  // contain digits are never mistaken for a phone number.
  const phoneLinePattern = /^(?:(?:telephone|phone|tel|ph)\.?\s*[:#]?\s*)?(\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4})$/i;

  let currentResource = null;

  for (const line of lines) {
    // Skip header/footer lines
    const isHeaderOrFooter =
      line.includes('Free Community Meals') ||
      (line.includes('Edmonton') && line.includes('Food Bank')) ||
      /page\s+\d+/i.test(line);
    if (isHeaderOrFooter) {
      continue;
    }

    // A phone-only line must be handled before name detection: something
    // like "(780) 422-2018" does not start with a digit and would otherwise
    // be taken for the name of a new entry.
    const phoneMatch = line.match(phoneLinePattern);
    if (phoneMatch) {
      if (currentResource && !currentResource.phone) {
        currentResource.phone = phoneMatch[1];
      }
      continue;
    }

    // Try to identify a new resource entry
    // Usually starts with the location name (no numbers at start)
    const looksLikeName =
      !/^\d/.test(line) &&
      !dayPatterns.test(line) &&
      !timePattern.test(line) &&
      line.length > 5 &&
      line.length < 100;

    if (looksLikeName) {
      // Save previous resource if exists
      if (currentResource && currentResource.name) {
        resources.push(currentResource);
      }

      currentResource = {
        name: line,
        city: 'Edmonton',
        source: 'edmonton_foodbank_pdf'
      };
      continue;
    }

    // Try to extract address
    if (currentResource && addressPattern.test(line)) {
      currentResource.address = line;
      continue;
    }

    // Try to extract days/times
    if (currentResource && (dayPatterns.test(line) || timePattern.test(line))) {
      currentResource.hours_of_operation = currentResource.hours_of_operation
        ? `${currentResource.hours_of_operation}; ${line}`
        : line;
    }
  }

  // Don't forget the last resource
  if (currentResource && currentResource.name) {
    resources.push(currentResource);
  }

  return resources;
}

/**
 * Insert a parsed resource into `food_resources`, or update the existing row
 * that shares the same (source, source_id).
 *
 * The source id is derived from the resource name and city: whitespace runs
 * become dashes and the result is lower-cased.
 *
 * @param {Object} resource - Parsed entry; `name` and `city` are read, and
 *   `description`, `address`, `phone`, `hours_of_operation` when present.
 * @param {string} sourceUrl - URL of the PDF the entry was parsed from.
 * @returns {Promise<Object|null>} The first returned row (carrying the
 *   `inserted` column), or `null` when the query fails; the failure is logged.
 */
async function saveResource(resource, sourceUrl) {
  const { name, city } = resource;
  const sourceId = `pdf-${name}-${city}`.replace(/\s+/g, '-').toLowerCase();

  const upsertSql = `
      INSERT INTO food_resources (
        name, description, resource_type,
        address, city, phone, hours_of_operation,
        source, source_url, source_id,
        updated_at, last_verified_at
      ) VALUES ($1, $2, 'community_meal', $3, $4, $5, $6, 'edmonton_foodbank_pdf', $7, $8, NOW(), NOW())
      ON CONFLICT (source, source_id)
      DO UPDATE SET
        name = EXCLUDED.name,
        address = COALESCE(EXCLUDED.address, food_resources.address),
        hours_of_operation = COALESCE(EXCLUDED.hours_of_operation, food_resources.hours_of_operation),
        updated_at = NOW()
      RETURNING (xmax = 0) AS inserted
    `;

  // Positional values for $1..$8; missing optional fields are sent as NULL.
  const values = [
    name,
    resource.description || 'Free community meal',
    resource.address || null,
    city,
    resource.phone || null,
    resource.hours_of_operation || null,
    sourceUrl,
    sourceId
  ];

  try {
    const { rows } = await db.query(upsertSql, values);
    return rows[0];
  } catch (err) {
    logger.error('Failed to save PDF resource', { name: resource.name, error: err.message });
    return null;
  }
}

/**
 * Download and parse every PDF in PDF_SOURCES and store the parsed resources.
 *
 * A `scrape_logs` row is created with status 'running' before any work is
 * done and is updated to 'completed' (with counters) or 'failed' (with the
 * error message) at the end. A PDF whose download or parsing fails is logged
 * and skipped; it does not fail the whole run.
 *
 * @returns {Promise<{found: number, added: number, updated: number}>}
 *   Number of resources parsed, newly inserted and updated.
 * @throws {Error} If creating the log row fails, or if the run fails after
 *   the log row exists (the original error is rethrown).
 */
async function parsePDFs() {
  logger.info('Starting PDF parsing');

  const logResult = await db.query(`
    INSERT INTO scrape_logs (source, status)
    VALUES ('edmonton_foodbank_pdf', 'running')
    RETURNING id
  `);
  const logId = logResult.rows[0].id;

  let totalFound = 0;
  let totalAdded = 0;
  let totalUpdated = 0;

  try {
    for (const pdfSource of PDF_SOURCES) {
      logger.info(`Processing PDF: ${pdfSource.name}`);

      const pdfBuffer = await downloadPDF(pdfSource.url);
      if (!pdfBuffer) {
        logger.warn(`Skipping ${pdfSource.name} - download failed`);
        continue;
      }

      try {
        const data = await pdfParse(pdfBuffer);
        logger.info(`Extracted ${data.text.length} characters from PDF`);

        const resources = parseEdmontonMealsPDF(data.text);
        logger.info(`Parsed ${resources.length} resources from PDF`);

        totalFound += resources.length;

        for (const resource of resources) {
          const saved = await saveResource(resource, pdfSource.url);
          if (!saved) {
            // saveResource already logged the failure; nothing to count.
            continue;
          }
          if (saved.inserted) {
            totalAdded++;
          } else {
            totalUpdated++;
          }
        }
      } catch (parseError) {
        // One unreadable PDF must not abort the remaining sources.
        logger.error('PDF parsing failed', {
          source: pdfSource.name,
          error: parseError.message
        });
      }
    }

    await db.query(`
      UPDATE scrape_logs
      SET completed_at = NOW(),
          status = 'completed',
          records_found = $1,
          records_added = $2,
          records_updated = $3
      WHERE id = $4
    `, [totalFound, totalAdded, totalUpdated, logId]);

    logger.info('PDF parsing completed', {
      found: totalFound,
      added: totalAdded,
      updated: totalUpdated
    });

    return { found: totalFound, added: totalAdded, updated: totalUpdated };

  } catch (error) {
    // Recording the failure is best effort: if this query rejects as well
    // (e.g. the database is the thing that failed), the caller must still
    // receive the original error rather than the secondary one.
    try {
      await db.query(`
        UPDATE scrape_logs
        SET completed_at = NOW(),
            status = 'failed',
            error_message = $1
        WHERE id = $2
      `, [error.message, logId]);
    } catch (logError) {
      logger.error('Failed to record scrape failure', {
        logId,
        error: logError.message
      });
    }

    throw error;
  }
}

// When this file is the program entry point (not loaded via require), run
// the parser once and exit with 0 on success or 1 on failure.
if (require.main === module) {
  (async () => {
    try {
      const result = await parsePDFs();
      console.log('PDF parsing completed:', result);
      process.exit(0);
    } catch (err) {
      console.error('PDF parsing failed:', err);
      process.exit(1);
    }
  })();
}

module.exports = { parsePDFs };