// 398 lines
// 12 KiB
// JavaScript

const axios = require('axios');
const cheerio = require('cheerio');
const db = require('../models/db');
const logger = require('../utils/logger');
const geocoding = require('../services/geocoding');
// Common prefix of the InformAlberta pages this scraper requests.
const BASE_URL = 'https://informalberta.ca/public/common';

// One entry per zone, written as [comboListId, zone name] and expanded
// into the { id, name } records the rest of the file iterates over.
const COMBO_LISTS = [
  ['1004954', 'North Zone'],
  ['1004953', 'Edmonton Zone'],
  ['1004951', 'Calgary Zone'],
  ['1004952', 'Central Zone'],
  ['1004903', 'South Zone']
].map(([id, name]) => ({ id, name }));

/**
 * Rate limiting helper - be respectful to the server.
 *
 * @param {number} ms - How long to wait, in milliseconds.
 * @returns {Promise<void>} Resolves once the timer has fired.
 */
function delay(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Download a page and hand back its body.
 *
 * Failures never propagate: they are logged through `logger.error` and
 * reported to the caller as `null`.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} The `data` of the axios response, or `null` when the request failed.
 */
async function fetchPage(url) {
  const requestOptions = {
    headers: {
      'User-Agent': 'FreeAlbertaFoodBot/1.0 (contact@freealberta.org)',
      'Accept': 'text/html,application/xhtml+xml'
    },
    timeout: 30000
  };

  let body = null;
  try {
    const response = await axios.get(url, requestOptions);
    body = response.data;
  } catch (err) {
    logger.error(`Failed to fetch ${url}`, { error: err.message });
  }
  return body;
}
/**
 * Fetch a service profile page and extract whatever contact details can be
 * found in it.
 *
 * Name, address and description come from DOM selectors; phone, hours,
 * email and website are found by pattern-matching the raw HTML, so they are
 * best-effort guesses. Fields that are not found are left off the result.
 *
 * @param {string} serviceUrl - URL of the service profile page.
 * @returns {Promise<Object|null>} Object with `name` and, when found,
 *   `address`, `phone`, `description`, `hours_of_operation`, `email` and
 *   `website`; `null` when the page could not be fetched.
 */
async function parseServicePage(serviceUrl) {
  const html = await fetchPage(serviceUrl);
  if (!html) return null;

  const $ = cheerio.load(html);
  const resource = {};

  // Parse service profile page - structure may vary, so fall back from the
  // main heading to a dedicated class and finally to the <title> text.
  resource.name = $('h1').first().text().trim() ||
    $('.service-name').text().trim() ||
    $('title').text().split('|')[0].trim();

  // Try multiple selectors for different page structures
  const addressBlock = $('.address, .location-address, [class*="address"]').first();
  if (addressBlock.length) {
    resource.address = addressBlock.text().replace(/\s+/g, ' ').trim();
  }

  // Look for phone numbers. The digit lookarounds stop the pattern from
  // matching ten digits out of the middle of a longer number (ids,
  // timestamps); an optional leading "1" / "+1" is skipped, not captured.
  const phoneMatch = html.match(/(?<!\d)(?:\+?1[-.\s]?)?(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})(?!\d)/);
  if (phoneMatch) {
    resource.phone = phoneMatch[1];
  }

  // Look for description/service info; the first selector that yields more
  // than 20 characters of text wins.
  const descriptionSelectors = [
    '.service-description',
    '.description',
    '#description',
    '.program-description',
    '[class*="description"]'
  ];
  for (const selector of descriptionSelectors) {
    const desc = $(selector).first().text().trim();
    if (desc && desc.length > 20) {
      resource.description = desc;
      break;
    }
  }

  // Hours of operation. The match runs over raw HTML, so line breaks and
  // indentation are collapsed the same way the address is.
  const hoursMatch = html.match(/hours?[:\s]*([\w\s\d:;,.-]+)/i);
  if (hoursMatch) {
    resource.hours_of_operation = hoursMatch[1].replace(/\s+/g, ' ').trim();
  }

  // Email
  const emailMatch = html.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
  if (emailMatch) {
    resource.email = emailMatch[0];
  }

  // Website: first URL in the page that is neither the directory itself nor
  // a Facebook link, and is shorter than 200 characters.
  const websiteMatch = html.match(/https?:\/\/[^\s"'<>]+/g);
  if (websiteMatch) {
    const externalSite = websiteMatch.find(url =>
      !url.includes('informalberta.ca') &&
      !url.includes('facebook.com') &&
      url.length < 200
    );
    if (externalSite) {
      resource.website = externalSite;
    }
  }

  return resource;
}
/**
 * Fetch a sublist page and collect the resources it mentions.
 *
 * Two kinds of entries are returned: links to service pages
 * (`{ name, url, zone }`) and entries read directly from list markup
 * (`{ name, address, phone, zone, fromList: true }`).
 *
 * @param {string} sublistUrl - URL of the sublist page.
 * @param {string} zone - Zone name copied onto every returned entry.
 * @returns {Promise<Object[]>} Entries found; empty when the page could not be fetched.
 */
async function parseSublist(sublistUrl, zone) {
  const html = await fetchPage(sublistUrl);
  if (!html) return [];

  const $ = cheerio.load(html);
  const resources = [];

  // Resolve hrefs against the page they were found on, as a browser would,
  // so "../x", "/x" and "x" all end up at the right place. Hrefs that are
  // already absolute are kept untouched. If sublistUrl is not a usable base
  // the previous BASE_URL concatenation is used instead.
  const toAbsoluteUrl = (href) => {
    if (href.startsWith('http')) return href;
    try {
      return new URL(href, sublistUrl).href;
    } catch (error) {
      return `${BASE_URL}/${href.replace('../', '')}`;
    }
  };

  // Find all service links
  $('a[href*="serviceProfileStyled.do"], a[href*="serviceQueryId"]').each((_, el) => {
    const href = $(el).attr('href');
    const name = $(el).text().trim();
    if (href && name) {
      resources.push({
        name,
        url: toAbsoluteUrl(href),
        zone
      });
    }
  });

  // Also look for direct resource info in list format
  $('.service-item, .resource-item, [class*="service"]').each((_, el) => {
    const name = $(el).find('.name, .title, h3, h4').first().text().trim();
    const address = $(el).find('.address, .location').first().text().trim();
    const phone = $(el).find('.phone, .tel').first().text().trim();
    if (name) {
      resources.push({
        name,
        address,
        phone,
        zone,
        fromList: true
      });
    }
  });

  return resources;
}
/**
 * Fetch a zone's combo list page and return the sublist links on it.
 *
 * @param {string} comboListId - Value for the `comboListId` query parameter.
 * @param {string} zoneName - Zone name copied onto every returned entry.
 * @returns {Promise<Array<{url: string, area: string, zone: string}>>}
 *   One entry per link found; empty when the page could not be fetched.
 */
async function parseComboList(comboListId, zoneName) {
  const url = `${BASE_URL}/viewComboList.do?comboListId=${comboListId}`;
  const html = await fetchPage(url);
  if (!html) return [];

  const $ = cheerio.load(html);
  const sublists = [];

  // Resolve hrefs against the page they were found on, as a browser would,
  // so "../x", "/x" and "x" all end up at the right place. Hrefs that are
  // already absolute are kept untouched. If the page URL is not a usable
  // base the previous BASE_URL concatenation is used instead.
  const toAbsoluteUrl = (href) => {
    if (href.startsWith('http')) return href;
    try {
      return new URL(href, url).href;
    } catch (error) {
      return `${BASE_URL}/${href.replace('../', '')}`;
    }
  };

  // Find all sublist links
  $('a[href*="viewSublist.do"], a[href*="cartId"]').each((_, el) => {
    const href = $(el).attr('href');
    const areaName = $(el).text().trim();
    if (href && areaName) {
      sublists.push({
        url: toAbsoluteUrl(href),
        area: areaName,
        zone: zoneName
      });
    }
  });

  return sublists;
}
/**
 * Classify a resource from keywords in its name and description.
 *
 * Rules are tried top to bottom and the first one with a keyword present
 * (case-insensitive substring) decides the type.
 *
 * @param {string} name - Resource name.
 * @param {string} [description=''] - Optional free-text description.
 * @returns {string} The matching type label, or 'other' when no rule matches.
 */
function determineResourceType(name, description = '') {
  const haystack = `${name} ${description}`.toLowerCase();

  // [type, keywords] - order matters, earlier rules take precedence.
  const rules = [
    ['food_bank', ['food bank']],
    ['hamper', ['hamper']],
    ['community_meal', ['meal', 'soup', 'kitchen']],
    ['pantry', ['pantry']],
    ['mobile_food', ['mobile']],
    ['grocery_program', ['grocery']]
  ];

  const matched = rules.find(([, keywords]) =>
    keywords.some((keyword) => haystack.includes(keyword))
  );
  return matched ? matched[0] : 'other';
}
/**
 * Work out which city a resource is in.
 *
 * The address is checked first and, when it names several known cities, the
 * one mentioned last wins: the city normally follows the street, and
 * streets are often named after other cities ("Edmonton Trail" in Calgary).
 * Only when the address names no known city is the area name consulted,
 * first against the known-city list and then as free text up to its first
 * comma.
 *
 * @param {string} address - Street address; may be empty or missing.
 * @param {string} areaName - Name of the area list the resource came from; may be empty or missing.
 * @returns {string|null} City name, or `null` when nothing usable was found.
 */
function extractCity(address, areaName) {
  // Common Alberta cities
  const cities = [
    'Edmonton', 'Calgary', 'Red Deer', 'Lethbridge', 'Medicine Hat',
    'Grande Prairie', 'Fort McMurray', 'Airdrie', 'Spruce Grove',
    'St. Albert', 'Sherwood Park', 'Leduc', 'Camrose', 'Lloydminster',
    'Cold Lake', 'Wetaskiwin', 'Okotoks', 'Cochrane', 'Brooks',
    'Banff', 'Canmore', 'High River', 'Stony Plain', 'Hinton',
    'Slave Lake', 'Peace River', 'Drumheller', 'Barrhead', 'Edson',
    'Whitecourt', 'Taber', 'Jasper', 'Athabasca', 'Bonnyville'
  ];

  // Missing inputs are treated as empty text instead of throwing.
  const addressText = String(address || '').toLowerCase();
  const areaText = String(areaName || '');

  // Right-most known city in the address.
  let cityInAddress = null;
  let lastIndex = -1;
  for (const city of cities) {
    const index = addressText.lastIndexOf(city.toLowerCase());
    if (index > lastIndex) {
      cityInAddress = city;
      lastIndex = index;
    }
  }
  if (cityInAddress) return cityInAddress;

  // No city in the address: first known city (in list order) in the area name.
  const areaLower = areaText.toLowerCase();
  const cityInArea = cities.find((city) => areaLower.includes(city.toLowerCase()));
  if (cityInArea) return cityInArea;

  return areaText.split(',')[0].trim() || null;
}
/**
 * Look up coordinates for an address and/or city.
 *
 * The query sent to `geocoding.forwardGeocode` is the non-empty parts of
 * address and city followed by "Alberta, Canada". Lookup errors are logged
 * as warnings and reported as `null`.
 *
 * @param {string} address - Street address; may be empty.
 * @param {string} city - City name; may be empty.
 * @returns {Promise<{latitude: *, longitude: *}|null>} Coordinates, or
 *   `null` when there was nothing to look up, the lookup failed, or the
 *   result had no truthy latitude/longitude.
 */
async function geocodeAddress(address, city) {
  if (!(address || city)) {
    return null;
  }

  const parts = [address, city, 'Alberta', 'Canada'];
  const query = parts.filter((part) => Boolean(part)).join(', ');

  let coords = null;
  try {
    const hit = await geocoding.forwardGeocode(query);
    const usable = hit && hit.latitude && hit.longitude;
    if (usable) {
      logger.info(`Geocoded "${query}" to ${hit.latitude}, ${hit.longitude}`);
      coords = { latitude: hit.latitude, longitude: hit.longitude };
    }
  } catch (err) {
    logger.warn(`Failed to geocode "${query}": ${err.message}`);
  }
  return coords;
}
/**
 * Insert a resource into `food_resources`, or refresh the existing row with
 * the same `(source, source_id)`.
 *
 * `source_id` is the `serviceQueryId` from the source URL when there is
 * one; otherwise it is a slug built from the resource's name and city. A
 * `cartId` in the URL is deliberately not used: it identifies a list page,
 * and every resource saved with that list's URL would share the id and
 * overwrite the others through the ON CONFLICT clause.
 *
 * On update, the name is always replaced; the other fields are only
 * replaced when the new value is non-null.
 *
 * @param {Object} resource - Resource fields (`name`, `description`,
 *   `resource_type`, `address`, `city`, `latitude`, `longitude`, `phone`,
 *   `email`, `website`, `hours_of_operation`). Falsy values are stored as
 *   NULL, except `resource_type` which defaults to 'other'.
 * @param {string} sourceUrl - Page the resource was scraped from.
 * @returns {Promise<Object|null>} First returned row (`id`, `inserted`),
 *   or `null` when the query failed (the error is logged).
 */
async function saveResource(resource, sourceUrl) {
  const serviceMatch = sourceUrl.match(/serviceQueryId=(\d+)/);
  const sourceId = serviceMatch
    ? serviceMatch[1]
    : `${resource.name}-${resource.city || 'unknown'}`.replace(/\s+/g, '-');

  try {
    const result = await db.query(`
      INSERT INTO food_resources (
        name, description, resource_type,
        address, city, latitude, longitude,
        phone, email, website,
        hours_of_operation, source, source_url, source_id,
        updated_at, last_verified_at
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, NOW(), NOW())
      ON CONFLICT (source, source_id)
      DO UPDATE SET
        name = EXCLUDED.name,
        description = COALESCE(EXCLUDED.description, food_resources.description),
        address = COALESCE(EXCLUDED.address, food_resources.address),
        city = COALESCE(EXCLUDED.city, food_resources.city),
        latitude = COALESCE(EXCLUDED.latitude, food_resources.latitude),
        longitude = COALESCE(EXCLUDED.longitude, food_resources.longitude),
        phone = COALESCE(EXCLUDED.phone, food_resources.phone),
        email = COALESCE(EXCLUDED.email, food_resources.email),
        website = COALESCE(EXCLUDED.website, food_resources.website),
        hours_of_operation = COALESCE(EXCLUDED.hours_of_operation, food_resources.hours_of_operation),
        updated_at = NOW(),
        last_verified_at = NOW()
      RETURNING id, (xmax = 0) AS inserted
    `, [
      resource.name,
      resource.description || null,
      resource.resource_type || 'other',
      resource.address || null,
      resource.city || null,
      resource.latitude || null,
      resource.longitude || null,
      resource.phone || null,
      resource.email || null,
      resource.website || null,
      resource.hours_of_operation || null,
      'informalberta',
      sourceUrl,
      sourceId
    ]);
    return result.rows[0];
  } catch (error) {
    logger.error('Failed to save resource', { name: resource.name, error: error.message });
    return null;
  }
}
/**
 * Run a full scrape: every zone in COMBO_LISTS, every sublist in each zone,
 * every resource in each sublist.
 *
 * A `scrape_logs` row is created up front with status 'running' and is
 * updated to 'completed' (with counters) or 'failed' (with the error
 * message) at the end. Requests are made one at a time with pauses between
 * them.
 *
 * @returns {Promise<{found: number, added: number, updated: number}>}
 *   Number of resources seen, newly inserted and updated.
 * @throws Rethrows whatever error aborted the scrape, after trying to
 *   record it in `scrape_logs`.
 */
async function scrapeInformAlberta() {
  logger.info('Starting InformAlberta scrape');

  // Log scrape start
  const logResult = await db.query(`
    INSERT INTO scrape_logs (source, status)
    VALUES ('informalberta', 'running')
    RETURNING id
  `);
  const logId = logResult.rows[0].id;

  let totalFound = 0;
  let totalAdded = 0;
  let totalUpdated = 0;

  try {
    for (const zone of COMBO_LISTS) {
      logger.info(`Processing zone: ${zone.name}`);
      const sublists = await parseComboList(zone.id, zone.name);
      logger.info(`Found ${sublists.length} areas in ${zone.name}`);

      for (const sublist of sublists) {
        await delay(1000); // Rate limiting
        const resources = await parseSublist(sublist.url, zone.name);
        logger.info(`Found ${resources.length} resources in ${sublist.area}`);

        for (const res of resources) {
          totalFound++;

          // Fetch full details if we have a URL; entries read straight from
          // the list markup have no page of their own.
          let fullResource = { ...res };
          if (res.url && !res.fromList) {
            await delay(500);
            const details = await parseServicePage(res.url);
            if (details) {
              fullResource = { ...fullResource, ...details };
            }
          }

          // Determine resource type and city
          fullResource.resource_type = determineResourceType(
            fullResource.name,
            fullResource.description
          );
          fullResource.city = extractCity(
            fullResource.address || '',
            sublist.area
          );

          // Geocode address to get coordinates
          if (fullResource.address || fullResource.city) {
            await delay(1500); // Rate limit geocoding
            const coords = await geocodeAddress(fullResource.address, fullResource.city);
            if (coords) {
              fullResource.latitude = coords.latitude;
              fullResource.longitude = coords.longitude;
            }
          }

          // A null result means the save failed; it counts as found only.
          const saved = await saveResource(fullResource, res.url || sublist.url);
          if (saved) {
            if (saved.inserted) {
              totalAdded++;
            } else {
              totalUpdated++;
            }
          }
        }
      }
    }

    // Update scrape log
    await db.query(`
      UPDATE scrape_logs
      SET completed_at = NOW(),
          status = 'completed',
          records_found = $1,
          records_added = $2,
          records_updated = $3
      WHERE id = $4
    `, [totalFound, totalAdded, totalUpdated, logId]);

    logger.info('InformAlberta scrape completed', {
      found: totalFound,
      added: totalAdded,
      updated: totalUpdated
    });
    return { found: totalFound, added: totalAdded, updated: totalUpdated };
  } catch (error) {
    // Recording the failure is best-effort: if this query fails as well
    // (for example because the database is what broke), the caller must
    // still receive the error that actually aborted the scrape.
    try {
      await db.query(`
        UPDATE scrape_logs
        SET completed_at = NOW(),
            status = 'failed',
            error_message = $1
        WHERE id = $2
      `, [error.message, logId]);
    } catch (logError) {
      logger.error('Failed to record scrape failure', { error: logError.message });
    }
    logger.error('InformAlberta scrape failed', { error: error.message });
    throw error;
  }
}
// When this file is the program's entry point, run one scrape, print the
// outcome and exit with 0 on success or 1 on failure.
if (require.main === module) {
  (async () => {
    try {
      const result = await scrapeInformAlberta();
      console.log('Scrape completed:', result);
      process.exit(0);
    } catch (err) {
      console.error('Scrape failed:', err);
      process.exit(1);
    }
  })();
}

module.exports = { scrapeInformAlberta };