398 lines
12 KiB
JavaScript
398 lines
12 KiB
JavaScript
const axios = require('axios');
|
|
const cheerio = require('cheerio');
|
|
const db = require('../models/db');
|
|
const logger = require('../utils/logger');
|
|
const geocoding = require('../services/geocoding');
|
|
|
|
// Common prefix of the InformAlberta public pages this scraper requests.
const BASE_URL = 'https://informalberta.ca/public/common';

// Combo lists to crawl, one per zone. `id` is sent as the `comboListId`
// query parameter; `name` is the zone label attached to scraped records.
const COMBO_LISTS = [
  { id: '1004954', name: 'North Zone' },
  { id: '1004953', name: 'Edmonton Zone' },
  { id: '1004951', name: 'Calgary Zone' },
  { id: '1004952', name: 'Central Zone' },
  { id: '1004903', name: 'South Zone' },
];
|
|
|
|
// Rate limiting - be respectful to the server.
// Returns a promise that settles (with no value) once `ms` milliseconds have elapsed.
const delay = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
|
|
|
|
/**
 * Download a page with axios and hand back the response body.
 *
 * The request identifies itself with the scraper's User-Agent and uses a
 * `timeout` of 30000. Any failure is logged and reported as `null`; this
 * function never rejects because of a request error.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<*|null>} `response.data` on success, `null` on failure.
 */
async function fetchPage(url) {
  const requestOptions = {
    headers: {
      'User-Agent': 'FreeAlbertaFoodBot/1.0 (contact@freealberta.org)',
      'Accept': 'text/html,application/xhtml+xml'
    },
    timeout: 30000
  };

  try {
    const { data } = await axios.get(url, requestOptions);
    return data;
  } catch (error) {
    logger.error(`Failed to fetch ${url}`, { error: error.message });
    return null;
  }
}
|
|
|
|
/**
 * Fetch one service profile page and pull out whatever contact details can
 * be recognised on it.
 *
 * Extraction is heuristic: the name and address come from CSS selectors,
 * while phone, hours, email and website are regex matches against the raw
 * page markup. Only `name` is always present on the result (it may be an
 * empty string); every other key is set only when something matched.
 *
 * @param {string} serviceUrl - URL of the service profile page.
 * @returns {Promise<Object|null>} Parsed fields, or `null` when the page
 *   could not be fetched.
 */
async function parseServicePage(serviceUrl) {
  const html = await fetchPage(serviceUrl);
  if (!html) {
    return null;
  }

  const $ = cheerio.load(html);

  // Parse service profile page - structure may vary, so fall back from the
  // first <h1> to .service-name to the part of <title> before the first "|".
  const headingText = $('h1').first().text().trim();
  const resource = {
    name:
      headingText ||
      $('.service-name').text().trim() ||
      $('title').text().split('|')[0].trim()
  };

  // Address: first element matching any of the known address selectors,
  // with runs of whitespace collapsed to single spaces.
  const addressNode = $('.address, .location-address, [class*="address"]').first();
  if (addressNode.length) {
    resource.address = addressNode.text().replace(/\s+/g, ' ').trim();
  }

  // Phone: first 3-3-4 digit group anywhere in the markup.
  const phoneHit = html.match(/(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})/);
  if (phoneHit) {
    resource.phone = phoneHit[1];
  }

  // Description: first selector whose text is longer than 20 characters wins.
  const descriptionSelectors = [
    '.service-description',
    '.description',
    '#description',
    '.program-description',
    '[class*="description"]'
  ];
  descriptionSelectors.some((selector) => {
    const candidate = $(selector).first().text().trim();
    if (candidate && candidate.length > 20) {
      resource.description = candidate;
      return true;
    }
    return false;
  });

  // Hours of operation: whatever follows the first "hour"/"hours" in the markup.
  const hoursHit = html.match(/hours?[:\s]*([\w\s\d:;,.-]+)/i);
  if (hoursHit) {
    resource.hours_of_operation = hoursHit[1].trim();
  }

  // Email: first address-shaped token in the markup.
  const emailHit = html.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
  if (emailHit) {
    resource.email = emailHit[0];
  }

  // Website: first absolute URL that is neither InformAlberta nor Facebook
  // and is shorter than 200 characters.
  const allUrls = html.match(/https?:\/\/[^\s"'<>]+/g);
  if (allUrls) {
    const isExternalSite = (url) => {
      if (url.includes('informalberta.ca')) return false;
      if (url.includes('facebook.com')) return false;
      return url.length < 200;
    };
    const externalSite = allUrls.find(isExternalSite);
    if (externalSite) {
      resource.website = externalSite;
    }
  }

  return resource;
}
|
|
|
|
/**
 * Fetch one sublist (area) page and collect the resources it mentions.
 *
 * Two passes are made over the page:
 *  1. every link to a service profile becomes `{ name, url, zone }`;
 *  2. every inline "service"/"resource" item that has a name becomes
 *     `{ name, address, phone, zone, fromList: true }` (no `url`).
 *
 * Link hrefs are resolved with the WHATWG `URL` resolver against the URL of
 * the page they were found on, so `../service/x.do`, `/public/service/x.do`
 * and `x.do` all end up where a browser would go. (Previously hrefs were
 * glued onto BASE_URL after removing a single "../", which produced wrong
 * paths for parent-relative and root-relative links.)
 *
 * @param {string} sublistUrl - URL of the sublist page.
 * @param {string} zone - Zone label copied onto every returned resource.
 * @returns {Promise<Object[]>} Resources found; empty when the page could
 *   not be fetched.
 */
async function parseSublist(sublistUrl, zone) {
  const html = await fetchPage(sublistUrl);
  if (!html) return [];

  const $ = cheerio.load(html);
  const resources = [];

  // Resolve an href against the page it came from; if that page URL is not
  // itself usable as a base, fall back to the BASE_URL directory.
  const resolveHref = (href) => {
    for (const base of [sublistUrl, `${BASE_URL}/`]) {
      try {
        return new URL(href, base).href;
      } catch (error) {
        // Not resolvable against this base; try the next one.
      }
    }
    return null;
  };

  // Find all service links
  $('a[href*="serviceProfileStyled.do"], a[href*="serviceQueryId"]').each((_, el) => {
    const href = $(el).attr('href');
    const name = $(el).text().trim();
    if (!href || !name) return;

    const url = resolveHref(href);
    if (!url) {
      logger.warn(`Skipping unresolvable service link "${href}" on ${sublistUrl}`);
      return;
    }

    resources.push({ name, url, zone });
  });

  // Also look for direct resource info in list format
  $('.service-item, .resource-item, [class*="service"]').each((_, el) => {
    const item = $(el);
    const name = item.find('.name, .title, h3, h4').first().text().trim();
    const address = item.find('.address, .location').first().text().trim();
    const phone = item.find('.phone, .tel').first().text().trim();

    if (name) {
      resources.push({ name, address, phone, zone, fromList: true });
    }
  });

  return resources;
}
|
|
|
|
/**
 * Fetch a zone's combo list page and collect the links to its sublists.
 *
 * Every anchor whose href mentions `viewSublist.do` or `cartId` and that has
 * link text becomes `{ url, area, zone }`, where `area` is the link text.
 *
 * Hrefs are resolved with the WHATWG `URL` resolver against the combo list
 * page's own URL, so parent-relative (`../`) and root-relative (`/`) links
 * resolve the way a browser would. (Previously hrefs were glued onto
 * BASE_URL after removing a single "../", which produced wrong paths for
 * those forms.)
 *
 * @param {string} comboListId - Value for the `comboListId` query parameter.
 * @param {string} zoneName - Zone label copied onto every returned sublist.
 * @returns {Promise<Object[]>} Sublist descriptors; empty when the page
 *   could not be fetched.
 */
async function parseComboList(comboListId, zoneName) {
  const url = `${BASE_URL}/viewComboList.do?comboListId=${comboListId}`;
  const html = await fetchPage(url);
  if (!html) return [];

  const $ = cheerio.load(html);
  const sublists = [];

  // Find all sublist links
  $('a[href*="viewSublist.do"], a[href*="cartId"]').each((_, el) => {
    const href = $(el).attr('href');
    const areaName = $(el).text().trim();
    if (!href || !areaName) return;

    let fullUrl;
    try {
      fullUrl = new URL(href, url).href;
    } catch (error) {
      // A link we cannot turn into a URL cannot be crawled; note it and move on.
      logger.warn(`Skipping unresolvable sublist link "${href}" on ${url}`);
      return;
    }

    sublists.push({
      url: fullUrl,
      area: areaName,
      zone: zoneName
    });
  });

  return sublists;
}
|
|
|
|
/**
 * Classify a resource from keywords in its name and description.
 *
 * Matching is case-insensitive substring search over "name description".
 * Rules are checked top to bottom and the first rule with a matching
 * keyword decides the type, so "food bank" outranks "hamper", and so on.
 *
 * @param {string} name - Resource name.
 * @param {string} [description=''] - Optional free-text description.
 * @returns {string} One of 'food_bank', 'hamper', 'community_meal',
 *   'pantry', 'mobile_food', 'grocery_program' or 'other'.
 */
function determineResourceType(name, description = '') {
  const haystack = `${name} ${description}`.toLowerCase();

  // [type, keywords] pairs in priority order.
  const rules = [
    ['food_bank', ['food bank']],
    ['hamper', ['hamper']],
    ['community_meal', ['meal', 'soup', 'kitchen']],
    ['pantry', ['pantry']],
    ['mobile_food', ['mobile']],
    ['grocery_program', ['grocery']]
  ];

  const winner = rules.find(([, keywords]) =>
    keywords.some((keyword) => haystack.includes(keyword))
  );

  return winner ? winner[0] : 'other';
}
|
|
|
|
/**
 * Work out which city a resource belongs to.
 *
 * The address and the area name are searched (case-insensitively) for one of
 * a fixed list of Alberta cities; the first city in list order that appears
 * as a whole word wins and is returned in its canonical spelling. If none
 * appears, the part of the area name before the first comma is used.
 *
 * Whole-word matching means a street such as "Brookside Ave" is no longer
 * mistaken for the city of Brooks. Missing (`null`/`undefined`) arguments
 * are treated as empty text instead of throwing or being searched as the
 * literal words "undefined"/"null".
 *
 * @param {string} [address] - Street address text, if any.
 * @param {string} [areaName] - Name of the area/sublist the resource came from.
 * @returns {string|null} City name, or `null` when nothing usable was found.
 */
function extractCity(address, areaName) {
  // Common Alberta cities
  const cities = [
    'Edmonton', 'Calgary', 'Red Deer', 'Lethbridge', 'Medicine Hat',
    'Grande Prairie', 'Fort McMurray', 'Airdrie', 'Spruce Grove',
    'St. Albert', 'Sherwood Park', 'Leduc', 'Camrose', 'Lloydminster',
    'Cold Lake', 'Wetaskiwin', 'Okotoks', 'Cochrane', 'Brooks',
    'Banff', 'Canmore', 'High River', 'Stony Plain', 'Hinton',
    'Slave Lake', 'Peace River', 'Drumheller', 'Barrhead', 'Edson',
    'Whitecourt', 'Taber', 'Jasper', 'Athabasca', 'Bonnyville'
  ];

  const addressText = address == null ? '' : String(address);
  const areaText = areaName == null ? '' : String(areaName);
  const text = `${addressText} ${areaText}`;

  const matchedCity = cities.find((city) => {
    // Escape regex metacharacters (e.g. the "." in "St. Albert"), then
    // require that the name is not embedded inside a longer word.
    const escaped = city.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    return new RegExp(`(?<![a-z])${escaped}(?![a-z])`, 'i').test(text);
  });
  if (matchedCity) {
    return matchedCity;
  }

  return areaText.split(',')[0].trim() || null;
}
|
|
|
|
/**
 * Look up coordinates for an address via the geocoding service.
 *
 * The query is built from the non-empty parts of
 * `address, city, "Alberta", "Canada"`. Lookup errors are logged as
 * warnings and turned into `null`; this function does not reject on a
 * failed lookup.
 *
 * @param {string} [address] - Street address, may be empty.
 * @param {string} [city] - City name, may be empty.
 * @returns {Promise<{latitude: *, longitude: *}|null>} Coordinates, or
 *   `null` when both inputs are empty, the service returned no truthy
 *   latitude/longitude pair, or the lookup threw.
 */
async function geocodeAddress(address, city) {
  // Nothing to look up without at least one of the two.
  if (!(address || city)) {
    return null;
  }

  const parts = [address, city, 'Alberta', 'Canada'];
  const fullAddress = parts.filter((part) => Boolean(part)).join(', ');

  try {
    const hit = await geocoding.forwardGeocode(fullAddress);
    const hasCoordinates = Boolean(hit && hit.latitude && hit.longitude);
    if (!hasCoordinates) {
      return null;
    }

    const { latitude, longitude } = hit;
    logger.info(`Geocoded "${fullAddress}" to ${latitude}, ${longitude}`);
    return { latitude, longitude };
  } catch (error) {
    logger.warn(`Failed to geocode "${fullAddress}": ${error.message}`);
    return null;
  }
}
|
|
|
|
/**
 * Insert a scraped resource into `food_resources`, or refresh the existing
 * row with the same (source, source_id).
 *
 * The source id is chosen as follows:
 *  - for a resource with its own page, the `serviceQueryId` (or, failing
 *    that, `cartId`) number found in `sourceUrl`;
 *  - otherwise a slug built from "<name>-<city or 'unknown'>".
 *
 * Resources flagged `fromList` were read off a list page and are saved with
 * that list page's URL, so any id in the URL identifies the list, not the
 * resource. Using it would give every item on the same list one shared
 * source_id and the upsert would collapse them into a single row; such
 * resources therefore always get the name/city slug.
 *
 * On conflict the name is always replaced, while the other scraped columns
 * keep their stored value when the new value is NULL (COALESCE).
 *
 * @param {Object} resource - Scraped resource fields (name, description,
 *   resource_type, address, city, latitude, longitude, phone, email,
 *   website, hours_of_operation, and optionally fromList).
 * @param {string} sourceUrl - URL the resource was scraped from.
 * @returns {Promise<Object|null>} The returned row (`id`, `inserted`), or
 *   `null` when the query failed (the failure is logged).
 */
async function saveResource(resource, sourceUrl) {
  const url = typeof sourceUrl === 'string' ? sourceUrl : '';

  const urlIdMatch = resource.fromList
    ? null
    : url.match(/serviceQueryId=(\d+)/) || url.match(/cartId=(\d+)/);
  const sourceId = urlIdMatch
    ? urlIdMatch[1]
    : `${resource.name}-${resource.city || 'unknown'}`.replace(/\s+/g, '-');

  try {
    const result = await db.query(`
      INSERT INTO food_resources (
        name, description, resource_type,
        address, city, latitude, longitude,
        phone, email, website,
        hours_of_operation, source, source_url, source_id,
        updated_at, last_verified_at
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, NOW(), NOW())
      ON CONFLICT (source, source_id)
      DO UPDATE SET
        name = EXCLUDED.name,
        description = COALESCE(EXCLUDED.description, food_resources.description),
        address = COALESCE(EXCLUDED.address, food_resources.address),
        city = COALESCE(EXCLUDED.city, food_resources.city),
        latitude = COALESCE(EXCLUDED.latitude, food_resources.latitude),
        longitude = COALESCE(EXCLUDED.longitude, food_resources.longitude),
        phone = COALESCE(EXCLUDED.phone, food_resources.phone),
        email = COALESCE(EXCLUDED.email, food_resources.email),
        website = COALESCE(EXCLUDED.website, food_resources.website),
        hours_of_operation = COALESCE(EXCLUDED.hours_of_operation, food_resources.hours_of_operation),
        updated_at = NOW(),
        last_verified_at = NOW()
      RETURNING id, (xmax = 0) AS inserted
    `, [
      resource.name,
      resource.description || null,
      resource.resource_type || 'other',
      resource.address || null,
      resource.city || null,
      resource.latitude || null,
      resource.longitude || null,
      resource.phone || null,
      resource.email || null,
      resource.website || null,
      resource.hours_of_operation || null,
      'informalberta',
      url || null,
      sourceId
    ]);

    return result.rows[0];
  } catch (error) {
    logger.error('Failed to save resource', { name: resource.name, error: error.message });
    return null;
  }
}
|
|
|
|
/**
 * Run a full InformAlberta scrape: walk every zone in COMBO_LISTS, every
 * sublist of each zone and every resource on each sublist, enrich each
 * resource (detail page, type, city, coordinates) and upsert it.
 *
 * A `scrape_logs` row is created with status 'running' at the start and is
 * updated to 'completed' (with counters) or 'failed' (with the error
 * message) at the end. Requests are spaced out with fixed delays.
 *
 * @returns {Promise<{found: number, added: number, updated: number}>}
 *   Counters for resources seen, inserted and updated.
 * @throws Rethrows whatever error aborted the scrape, after attempting to
 *   record the failure in `scrape_logs`.
 */
async function scrapeInformAlberta() {
  logger.info('Starting InformAlberta scrape');

  // Log scrape start
  const logResult = await db.query(`
    INSERT INTO scrape_logs (source, status)
    VALUES ('informalberta', 'running')
    RETURNING id
  `);
  const logId = logResult.rows[0].id;

  let totalFound = 0;
  let totalAdded = 0;
  let totalUpdated = 0;

  try {
    for (const zone of COMBO_LISTS) {
      logger.info(`Processing zone: ${zone.name}`);

      const sublists = await parseComboList(zone.id, zone.name);
      logger.info(`Found ${sublists.length} areas in ${zone.name}`);

      for (const sublist of sublists) {
        await delay(1000); // Rate limiting

        const resources = await parseSublist(sublist.url, zone.name);
        logger.info(`Found ${resources.length} resources in ${sublist.area}`);

        for (const res of resources) {
          totalFound++;

          // Fetch full details if we have a URL
          let fullResource = { ...res };
          if (res.url && !res.fromList) {
            await delay(500);
            const details = await parseServicePage(res.url);
            if (details) {
              fullResource = {
                ...fullResource,
                ...details,
                // The detail page may yield an empty name; keep the name from
                // the listing rather than blanking it out.
                name: details.name || fullResource.name
              };
            }
          }

          // Determine resource type and city
          fullResource.resource_type = determineResourceType(
            fullResource.name,
            fullResource.description
          );
          fullResource.city = extractCity(
            fullResource.address || '',
            sublist.area
          );

          // Geocode address to get coordinates
          if (fullResource.address || fullResource.city) {
            await delay(1500); // Rate limit geocoding
            const coords = await geocodeAddress(fullResource.address, fullResource.city);
            if (coords) {
              fullResource.latitude = coords.latitude;
              fullResource.longitude = coords.longitude;
            }
          }

          const saved = await saveResource(fullResource, res.url || sublist.url);
          if (saved) {
            if (saved.inserted) {
              totalAdded++;
            } else {
              totalUpdated++;
            }
          }
        }
      }
    }

    // Update scrape log
    await db.query(`
      UPDATE scrape_logs
      SET completed_at = NOW(),
          status = 'completed',
          records_found = $1,
          records_added = $2,
          records_updated = $3
      WHERE id = $4
    `, [totalFound, totalAdded, totalUpdated, logId]);

    logger.info('InformAlberta scrape completed', {
      found: totalFound,
      added: totalAdded,
      updated: totalUpdated
    });

    return { found: totalFound, added: totalAdded, updated: totalUpdated };
  } catch (error) {
    // Recording the failure is best-effort: if this query fails too (for
    // example because the database is what broke), the caller must still
    // receive the original error, not the bookkeeping one.
    try {
      await db.query(`
        UPDATE scrape_logs
        SET completed_at = NOW(),
            status = 'failed',
            error_message = $1
        WHERE id = $2
      `, [error.message, logId]);
    } catch (logError) {
      logger.error('Failed to record scrape failure', { error: logError.message });
    }

    logger.error('InformAlberta scrape failed', { error: error.message });
    throw error;
  }
}
|
|
|
|
// Run if called directly: execute one scrape, print the outcome and exit
// with status 0 on success or 1 on failure.
if (require.main === module) {
  const reportSuccess = (result) => {
    console.log('Scrape completed:', result);
    process.exit(0);
  };

  const reportFailure = (err) => {
    console.error('Scrape failed:', err);
    process.exit(1);
  };

  scrapeInformAlberta().then(reportSuccess).catch(reportFailure);
}

module.exports = { scrapeInformAlberta };
|