const axios = require('axios');
const cheerio = require('cheerio');
const db = require('../models/db');
const logger = require('../utils/logger');
const geocoding = require('../services/geocoding');

const BASE_URL = 'https://informalberta.ca/public/common';

// Combo list ids requested through viewComboList.do, paired with the zone
// name that is attached to everything found underneath that list.
const COMBO_LISTS = [
  { id: '1004954', name: 'North Zone' },
  { id: '1004953', name: 'Edmonton Zone' },
  { id: '1004951', name: 'Calgary Zone' },
  { id: '1004952', name: 'Central Zone' },
  { id: '1004903', name: 'South Zone' }
];

// City names recognised by extractCity(); the first entry found in the text wins.
const ALBERTA_CITIES = [
  'Edmonton', 'Calgary', 'Red Deer', 'Lethbridge', 'Medicine Hat',
  'Grande Prairie', 'Fort McMurray', 'Airdrie', 'Spruce Grove', 'St. Albert',
  'Sherwood Park', 'Leduc', 'Camrose', 'Lloydminster', 'Cold Lake',
  'Wetaskiwin', 'Okotoks', 'Cochrane', 'Brooks', 'Banff', 'Canmore',
  'High River', 'Stony Plain', 'Hinton', 'Slave Lake', 'Peace River',
  'Drumheller', 'Barrhead', 'Edson', 'Whitecourt', 'Taber', 'Jasper',
  'Athabasca', 'Bonnyville'
];

/**
 * Rate limiting helper - be respectful to the server.
 *
 * @param {number} ms - Milliseconds to wait.
 * @returns {Promise<void>} Resolves after `ms` milliseconds.
 */
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));

/**
 * Resolve a link found on a page against that page's URL.
 *
 * Hand-concatenating BASE_URL with the href mis-resolves "../x" and "/x"
 * links; the URL constructor applies the standard resolution rules instead.
 *
 * @param {string} href - Raw href attribute value.
 * @param {string} pageUrl - Absolute URL of the page the link was found on.
 * @returns {string|null} Absolute http(s) URL, or null when the href cannot be
 *   parsed or does not point at an http(s) resource (e.g. "javascript:").
 */
function resolveUrl(href, pageUrl) {
  try {
    const resolved = new URL(href, pageUrl);
    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
      return null;
    }
    return resolved.href;
  } catch (error) {
    return null;
  }
}

/**
 * GET a page and return its body.
 *
 * Failures are logged and reported as null so one bad page does not abort
 * the whole scrape.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<*>} `response.data` from axios, or null on any error.
 */
async function fetchPage(url) {
  try {
    const response = await axios.get(url, {
      headers: {
        'User-Agent': 'FreeAlbertaFoodBot/1.0 (contact@freealberta.org)',
        'Accept': 'text/html,application/xhtml+xml'
      },
      timeout: 30000
    });
    return response.data;
  } catch (error) {
    logger.error(`Failed to fetch ${url}`, { error: error.message });
    return null;
  }
}

/**
 * Fetch a service profile page and pull out whatever details can be found.
 *
 * Extraction is heuristic: several selectors are tried and the phone, hours,
 * email and website fields come from regex matches against the raw HTML.
 * Only `name` is always set (possibly to an empty string); every other field
 * is present only when something matched.
 *
 * @param {string} serviceUrl - Absolute URL of the service profile page.
 * @returns {Promise<Object|null>} Parsed fields, or null if the fetch failed.
 */
async function parseServicePage(serviceUrl) {
  const html = await fetchPage(serviceUrl);
  if (!html) return null;

  const $ = cheerio.load(html);
  const resource = {};

  // Parse service profile page - structure may vary
  resource.name = $('h1').first().text().trim() ||
    $('.service-name').text().trim() ||
    $('title').text().split('|')[0].trim();

  // Try multiple selectors for different page structures
  const addressBlock = $('.address, .location-address, [class*="address"]').first();
  if (addressBlock.length) {
    resource.address = addressBlock.text().replace(/\s+/g, ' ').trim();
  }

  // Look for phone numbers
  const phoneMatch = html.match(/(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})/);
  if (phoneMatch) {
    resource.phone = phoneMatch[1];
  }

  // Look for description/service info; the first selector yielding more than
  // 20 characters of text wins.
  const descriptionSelectors = [
    '.service-description',
    '.description',
    '#description',
    '.program-description',
    '[class*="description"]'
  ];
  for (const selector of descriptionSelectors) {
    const desc = $(selector).first().text().trim();
    if (desc && desc.length > 20) {
      resource.description = desc;
      break;
    }
  }

  // Hours of operation
  const hoursMatch = html.match(/hours?[:\s]*([\w\s\d:;,.-]+)/i);
  if (hoursMatch) {
    resource.hours_of_operation = hoursMatch[1].trim();
  }

  // Email
  const emailMatch = html.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
  if (emailMatch) {
    resource.email = emailMatch[0];
  }

  // Website: first URL in the HTML that is not InformAlberta or Facebook
  const websiteMatch = html.match(/https?:\/\/[^\s"'<>]+/g);
  if (websiteMatch) {
    const externalSite = websiteMatch.find(url =>
      !url.includes('informalberta.ca') &&
      !url.includes('facebook.com') &&
      url.length < 200
    );
    if (externalSite) {
      resource.website = externalSite;
    }
  }

  return resource;
}

/**
 * Collect the resources listed on one sublist page.
 *
 * Two kinds of entries are returned:
 *  - links to service profile pages: `{ name, url, zone }`
 *  - entries read straight from list markup:
 *    `{ name, address, phone, zone, fromList: true }` (no `url`)
 *
 * @param {string} sublistUrl - Absolute URL of the sublist page.
 * @param {string} zone - Zone name to attach to every entry.
 * @returns {Promise<Object[]>} Entries found; empty if the fetch failed.
 */
async function parseSublist(sublistUrl, zone) {
  const html = await fetchPage(sublistUrl);
  if (!html) return [];

  const $ = cheerio.load(html);
  const resources = [];

  // Find all service links
  $('a[href*="serviceProfileStyled.do"], a[href*="serviceQueryId"]').each((_, el) => {
    const href = $(el).attr('href');
    const name = $(el).text().trim();
    if (!href || !name) return;

    // Resolve relative to the page the link lives on, as a browser would.
    const url = resolveUrl(href, sublistUrl);
    if (!url) return;

    resources.push({ name, url, zone });
  });

  // Also look for direct resource info in list format
  $('.service-item, .resource-item, [class*="service"]').each((_, el) => {
    const name = $(el).find('.name, .title, h3, h4').first().text().trim();
    const address = $(el).find('.address, .location').first().text().trim();
    const phone = $(el).find('.phone, .tel').first().text().trim();
    if (name) {
      resources.push({ name, address, phone, zone, fromList: true });
    }
  });

  return resources;
}

/**
 * Fetch a combo list page and return the sublists (areas) it links to.
 *
 * @param {string} comboListId - Value for the `comboListId` query parameter.
 * @param {string} zoneName - Zone name to attach to every sublist.
 * @returns {Promise<Array<{url: string, area: string, zone: string}>>}
 *   Sublist descriptors; empty if the fetch failed.
 */
async function parseComboList(comboListId, zoneName) {
  const url = `${BASE_URL}/viewComboList.do?comboListId=${comboListId}`;
  const html = await fetchPage(url);
  if (!html) return [];

  const $ = cheerio.load(html);
  const sublists = [];

  // Find all sublist links
  $('a[href*="viewSublist.do"], a[href*="cartId"]').each((_, el) => {
    const href = $(el).attr('href');
    const areaName = $(el).text().trim();
    if (!href || !areaName) return;

    const fullUrl = resolveUrl(href, url);
    if (!fullUrl) return;

    sublists.push({ url: fullUrl, area: areaName, zone: zoneName });
  });

  return sublists;
}

/**
 * Classify a resource from keywords in its name and description.
 *
 * Checks run in a fixed order and the first match wins, so e.g. a
 * "food bank hamper program" is classified as `food_bank`.
 *
 * @param {string} name - Resource name.
 * @param {string} [description=''] - Resource description.
 * @returns {string} One of 'food_bank', 'hamper', 'community_meal', 'pantry',
 *   'mobile_food', 'grocery_program' or 'other'.
 */
function determineResourceType(name, description = '') {
  const text = `${name} ${description}`.toLowerCase();

  if (text.includes('food bank')) return 'food_bank';
  if (text.includes('hamper')) return 'hamper';
  if (text.includes('meal') || text.includes('soup') || text.includes('kitchen')) return 'community_meal';
  if (text.includes('pantry')) return 'pantry';
  if (text.includes('mobile')) return 'mobile_food';
  if (text.includes('grocery')) return 'grocery_program';

  return 'other';
}

/**
 * Guess the city for a resource.
 *
 * Looks for a known city name (case-insensitive substring) in the address and
 * area name; otherwise falls back to the part of the area name before the
 * first comma.
 *
 * @param {string} address - Resource address; may be empty or missing.
 * @param {string} areaName - Area label of the sublist; may be empty or missing.
 * @returns {string|null} City name, or null when nothing usable was found.
 */
function extractCity(address, areaName) {
  // Missing inputs are treated as empty strings so they neither throw nor
  // leak the literal "undefined" into the searched text.
  const safeAddress = address || '';
  const safeArea = areaName || '';

  const text = `${safeAddress} ${safeArea}`.toLowerCase();
  for (const city of ALBERTA_CITIES) {
    if (text.includes(city.toLowerCase())) {
      return city;
    }
  }

  return safeArea.split(',')[0].trim() || null;
}

/**
 * Forward-geocode an address within Alberta, Canada.
 *
 * Geocoding errors are logged as warnings and reported as null.
 *
 * @param {string} [address] - Street address, if known.
 * @param {string} [city] - City, if known.
 * @returns {Promise<{latitude: *, longitude: *}|null>} Coordinates from the
 *   geocoding service, or null when there is no input, no result, or an error.
 */
async function geocodeAddress(address, city) {
  if (!address && !city) return null;

  const fullAddress = [address, city, 'Alberta', 'Canada']
    .filter(Boolean)
    .join(', ');

  try {
    const result = await geocoding.forwardGeocode(fullAddress);
    if (result && result.latitude && result.longitude) {
      logger.info(`Geocoded "${fullAddress}" to ${result.latitude}, ${result.longitude}`);
      return { latitude: result.latitude, longitude: result.longitude };
    }
  } catch (error) {
    logger.warn(`Failed to geocode "${fullAddress}": ${error.message}`);
  }

  return null;
}

/**
 * Build the value stored in `source_id` for a resource.
 *
 * Resources that came from list markup (`fromList`) have no page of their own,
 * so the URL passed alongside them is the sublist URL. Using its cartId would
 * give every list entry of that sublist the same id and make them overwrite
 * each other through the upsert, so those entries always use the name/city key.
 *
 * NOTE(review): rows written earlier for list entries were keyed by the
 * sublist's cartId; they will not be matched by the new key.
 *
 * @param {Object} resource - Resource about to be saved.
 * @param {string} sourceUrl - URL the resource was discovered through.
 * @returns {string} Identifier that is stable across scrape runs.
 */
function deriveSourceId(resource, sourceUrl) {
  if (!resource.fromList) {
    const idMatch = sourceUrl.match(/serviceQueryId=(\d+)/) ||
      sourceUrl.match(/cartId=(\d+)/);
    if (idMatch) return idMatch[1];
  }
  return `${resource.name}-${resource.city || 'unknown'}`.replace(/\s+/g, '-');
}

/**
 * Upsert a resource into `food_resources`, keyed on (source, source_id).
 *
 * On conflict the name is always replaced; the other columns keep their
 * stored value when the new value is NULL.
 *
 * @param {Object} resource - Resource fields; missing/empty ones are stored as NULL.
 * @param {string} sourceUrl - URL the resource was discovered through.
 * @returns {Promise<Object|null>} Row with `id` and `inserted` (true when the
 *   row was newly created), or null if the query failed.
 */
async function saveResource(resource, sourceUrl) {
  const sourceId = deriveSourceId(resource, sourceUrl);

  try {
    const result = await db.query(`
      INSERT INTO food_resources (
        name, description, resource_type, address, city,
        latitude, longitude, phone, email, website,
        hours_of_operation, source, source_url, source_id,
        updated_at, last_verified_at
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, NOW(), NOW())
      ON CONFLICT (source, source_id) DO UPDATE SET
        name = EXCLUDED.name,
        description = COALESCE(EXCLUDED.description, food_resources.description),
        address = COALESCE(EXCLUDED.address, food_resources.address),
        city = COALESCE(EXCLUDED.city, food_resources.city),
        latitude = COALESCE(EXCLUDED.latitude, food_resources.latitude),
        longitude = COALESCE(EXCLUDED.longitude, food_resources.longitude),
        phone = COALESCE(EXCLUDED.phone, food_resources.phone),
        email = COALESCE(EXCLUDED.email, food_resources.email),
        website = COALESCE(EXCLUDED.website, food_resources.website),
        hours_of_operation = COALESCE(EXCLUDED.hours_of_operation, food_resources.hours_of_operation),
        updated_at = NOW(),
        last_verified_at = NOW()
      RETURNING id, (xmax = 0) AS inserted
    `, [
      resource.name,
      resource.description || null,
      resource.resource_type || 'other',
      resource.address || null,
      resource.city || null,
      resource.latitude || null,
      resource.longitude || null,
      resource.phone || null,
      resource.email || null,
      resource.website || null,
      resource.hours_of_operation || null,
      'informalberta',
      sourceUrl,
      sourceId
    ]);

    return result.rows[0];
  } catch (error) {
    logger.error('Failed to save resource', {
      name: resource.name,
      error: error.message
    });
    return null;
  }
}

/**
 * Turn a sublist entry into a full resource: fetch its profile page when it
 * has one, classify it, work out its city and geocode it.
 *
 * @param {Object} res - Entry returned by parseSublist().
 * @param {string} areaName - Area label of the sublist the entry came from.
 * @returns {Promise<Object>} Enriched copy of `res`.
 */
async function enrichResource(res, areaName) {
  let fullResource = { ...res };

  // Fetch full details if we have a URL
  if (res.url && !res.fromList) {
    await delay(500);
    const details = await parseServicePage(res.url);
    if (details) {
      // The parsed name can be an empty string; only let it replace the
      // link text when it actually contains something.
      const { name: detailName, ...detailFields } = details;
      fullResource = { ...fullResource, ...detailFields };
      if (detailName) {
        fullResource.name = detailName;
      }
    }
  }

  // Determine resource type and city
  fullResource.resource_type = determineResourceType(
    fullResource.name,
    fullResource.description
  );
  fullResource.city = extractCity(fullResource.address || '', areaName);

  // Geocode address to get coordinates
  if (fullResource.address || fullResource.city) {
    await delay(1500); // Rate limit geocoding
    const coords = await geocodeAddress(fullResource.address, fullResource.city);
    if (coords) {
      fullResource.latitude = coords.latitude;
      fullResource.longitude = coords.longitude;
    }
  }

  return fullResource;
}

/**
 * Scrape every configured zone and upsert the resources found.
 *
 * A `scrape_logs` row is created at the start and updated to 'completed' or
 * 'failed' at the end. Requests are made sequentially with delays between them.
 *
 * @returns {Promise<{found: number, added: number, updated: number}>} Totals
 *   for the run.
 * @throws Rethrows whatever error aborted the scrape, after recording it.
 */
async function scrapeInformAlberta() {
  logger.info('Starting InformAlberta scrape');

  // Log scrape start
  const logResult = await db.query(`
    INSERT INTO scrape_logs (source, status)
    VALUES ('informalberta', 'running')
    RETURNING id
  `);
  const logId = logResult.rows[0].id;

  let totalFound = 0;
  let totalAdded = 0;
  let totalUpdated = 0;

  try {
    for (const zone of COMBO_LISTS) {
      logger.info(`Processing zone: ${zone.name}`);

      const sublists = await parseComboList(zone.id, zone.name);
      logger.info(`Found ${sublists.length} areas in ${zone.name}`);

      for (const sublist of sublists) {
        await delay(1000); // Rate limiting

        const resources = await parseSublist(sublist.url, zone.name);
        logger.info(`Found ${resources.length} resources in ${sublist.area}`);

        for (const res of resources) {
          totalFound++;

          const fullResource = await enrichResource(res, sublist.area);
          const saved = await saveResource(fullResource, res.url || sublist.url);
          if (saved) {
            if (saved.inserted) {
              totalAdded++;
            } else {
              totalUpdated++;
            }
          }
        }
      }
    }

    // Update scrape log
    await db.query(`
      UPDATE scrape_logs
      SET completed_at = NOW(),
          status = 'completed',
          records_found = $1,
          records_added = $2,
          records_updated = $3
      WHERE id = $4
    `, [totalFound, totalAdded, totalUpdated, logId]);

    logger.info('InformAlberta scrape completed', {
      found: totalFound,
      added: totalAdded,
      updated: totalUpdated
    });

    return { found: totalFound, added: totalAdded, updated: totalUpdated };
  } catch (error) {
    // Recording the failure must not replace the error that caused it.
    try {
      await db.query(`
        UPDATE scrape_logs
        SET completed_at = NOW(),
            status = 'failed',
            error_message = $1
        WHERE id = $2
      `, [error.message, logId]);
    } catch (logError) {
      logger.error('Failed to record scrape failure', { error: logError.message });
    }

    logger.error('InformAlberta scrape failed', { error: error.message });
    throw error;
  }
}

// Run if called directly
if (require.main === module) {
  scrapeInformAlberta()
    .then(result => {
      console.log('Scrape completed:', result);
      process.exit(0);
    })
    .catch(err => {
      console.error('Scrape failed:', err);
      process.exit(1);
    });
}

module.exports = { scrapeInformAlberta };