1796 字
9 分钟
Web自动化最佳实践:从脚本到生产级解决方案
Web自动化最佳实践:从脚本到生产级解决方案
引言
Web自动化已成为现代软件开发中不可或缺的一部分。无论是数据采集、测试自动化还是业务流程自动化,都需要高效、可靠的技术方案。本文将分享从简单脚本到生产级Web自动化解决方案的完整实践指南。
架构设计原则
1. 分层架构
// 分层架构示例const automationArchitecture = { // 控制层 controller: { strategy: 'page-object', flow: 'sequential' },
// 业务逻辑层 services: { authentication: 'auth-service', dataProcessing: 'processor-service', errorHandling: 'error-handler' },
// 基础设施层 infrastructure: { browser: 'puppeteer', storage: 'file-system', logging: 'structured-logs' }};2. 模块化设计
将功能拆分为独立的模块,便于维护和扩展:
modules/
├── auth/            // 认证模块
│   ├── login.js
│   └── session.js
├── navigation/      // 导航模块
│   ├── page-loader.js
│   └── url-manager.js
├── interaction/     // 交互模块
│   ├── form-handler.js
│   └── element-locator.js
└── data/            // 数据处理模块
    ├── extractor.js
    └── validator.js
核心技术实现
1. 浏览器自动化
Puppeteer高级配置
const puppeteer = require('puppeteer');const browser = await puppeteer.launch({ headless: 'new', args: [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-web-security', '--disable-features=TranslateUI', '--disable-ipc-flooding-protection' ], ignoreHTTPSErrors: true, defaultViewport: null, userDataDir: './user_data'});多页面管理
/**
 * Registry of named pages opened from a single browser instance.
 */
class PageManager {
  /**
   * @param {object} browser - Object exposing an async `newPage()` method
   *   (a Puppeteer `Browser` in the surrounding article).
   */
  constructor(browser) {
    this.browser = browser;
    this.pages = new Map();
  }

  /**
   * Opens a new page, sets its default timeout and registers it under `name`.
   *
   * Registering a name that is already in use replaces the map entry; the
   * earlier page is then no longer tracked and will not be closed by `closeAll()`.
   *
   * @param {string} name - Key used to look the page up later.
   * @param {{timeout?: number}} [options] - `timeout` in milliseconds; defaults to 30000.
   * @returns {Promise<object>} The newly created page.
   */
  async createPage(name, options = {}) {
    const page = await this.browser.newPage();
    // `??` instead of `||`: an explicit 0 must be forwarded unchanged rather than
    // silently replaced by the 30 s default.
    page.setDefaultTimeout(options.timeout ?? 30000);
    this.pages.set(name, page);
    return page;
  }

  /**
   * @param {string} name
   * @returns {object|undefined} The page registered under `name`, if any.
   */
  getPage(name) {
    return this.pages.get(name);
  }

  /**
   * Closes every registered page one after another, then empties the registry.
   * A failure to close one page is logged and does not stop the others.
   *
   * @returns {Promise<void>}
   */
  async closeAll() {
    for (const [name, page] of this.pages) {
      try {
        await page.close();
      } catch (error) {
        console.warn(`关闭页面 ${name} 时出错:`, error.message);
      }
    }
    this.pages.clear();
  }
}
// Article section heading: "2. 智能元素定位"
多重定位策略
/**
 * Locates an element on a page by trying several selector strategies in order.
 *
 * `selectors` maps a strategy name (`id`, `css`, `xpath`, `text`, `name`, `label`)
 * to the value for that strategy. Strategies without a value are skipped.
 */
class ElementLocator {
  /**
   * @param {object} page - Object exposing an async `$(query)` method
   *   (a Puppeteer `Page` in the surrounding article).
   */
  constructor(page) {
    this.page = page;
    // Order matters: strategies are attempted first to last.
    this.strategies = ['id', 'css', 'xpath', 'text', 'name', 'label'];
  }

  /**
   * Returns the first element found by any strategy.
   *
   * @param {object} selectors - Strategy name -> selector value.
   * @returns {Promise<object>} The first truthy result of `page.$`.
   * @throws {Error} When no strategy yields an element; `cause` carries the last
   *   query error, if one occurred.
   */
  async findElement(selectors) {
    let lastError = null;

    for (const strategy of this.strategies) {
      try {
        const element = await this.tryLocate(selectors, strategy);
        if (element) {
          return element;
        }
      } catch (error) {
        // Best effort: a failing strategy must not stop the remaining ones,
        // but the error is kept so the final failure is diagnosable.
        lastError = error;
      }
    }

    throw new Error(
      `无法定位元素: ${JSON.stringify(selectors)}`,
      lastError ? { cause: lastError } : undefined
    );
  }

  /**
   * Runs a single strategy.
   *
   * @param {object} selectors - Strategy name -> selector value.
   * @param {string} strategy - One of the names in `this.strategies`.
   * @returns {Promise<object|null>} The element, or `null` when the strategy has
   *   no value or is unknown.
   */
  async tryLocate(selectors, strategy) {
    const value = this.buildSelector(selectors, strategy);
    if (value === null) {
      return null;
    }

    switch (strategy) {
      case 'id':
        return this.page.$(`#${value}`);
      case 'css':
        return this.page.$(value);
      case 'xpath':
        // Puppeteer's built-in query-handler prefix is `xpath/` (not `xpath=`).
        return this.page.$(`xpath/${value}`);
      case 'text':
        // Puppeteer's built-in query-handler prefix is `text/` (not `text=`).
        return this.page.$(`text/${value}`);
      case 'name':
        // JSON.stringify yields a double-quoted, backslash-escaped attribute value.
        return this.page.$(`[name=${JSON.stringify(String(value))}]`);
      case 'label':
        // Looks the element up by accessible name via the `aria/` query handler.
        return this.page.$(`aria/${value}`);
      default:
        return null;
    }
  }

  /**
   * Returns the value supplied for `strategy`, or `null` when there is none.
   * There is deliberately no cross-strategy fallback: reusing e.g. the `text`
   * value as an id would query `#<text>` and could match an unrelated element.
   *
   * @param {object} selectors
   * @param {string} strategy
   * @returns {*} The selector value, or `null`.
   */
  buildSelector(selectors, strategy) {
    const value = selectors?.[strategy];
    return value === undefined || value === null || value === '' ? null : value;
  }
}
// Article section heading: "3. 错误处理与恢复机制"
全局错误处理
/**
 * Runs an async operation with bounded retries and a recovery step between attempts.
 */
class ErrorHandler {
  /**
   * @param {object} [options]
   * @param {number} [options.retryCount=3] - Attempts made by `executeWithRetry`.
   * @param {number} [options.retryDelay=2000] - Pause before each retry, in ms.
   * @param {number} [options.maxRetryCount=5] - Hard upper bound on attempts.
   */
  constructor({ retryCount = 3, retryDelay = 2000, maxRetryCount = 5 } = {}) {
    this.retryCount = retryCount;
    this.retryDelay = retryDelay;
    this.maxRetryCount = maxRetryCount;
  }

  /**
   * Calls `fn` until it succeeds, fails with a non-recoverable error, or the
   * attempt limit (`retryCount`, capped by `maxRetryCount`) is reached.
   *
   * @param {Function} fn - Async operation to run; called with no arguments.
   * @param {{page?: object, session?: object}} [context] - Passed to `recoveryAction`.
   * @returns {Promise<*>} Whatever `fn` resolves to.
   * @throws The original error when it is not recoverable; otherwise an `Error`
   *   whose `cause` is the last failure once the attempts are used up.
   */
  async executeWithRetry(fn, context = {}) {
    const attemptLimit = Math.min(this.retryCount, this.maxRetryCount);
    let lastError = null;

    for (let attempt = 1; attempt <= attemptLimit; attempt++) {
      try {
        return await fn();
      } catch (error) {
        lastError = error;
        console.warn(`执行失败 (尝试 ${attempt}/${attemptLimit}):`, error?.message);

        if (!this.isRecoverable(error)) {
          throw error;
        }
        if (attempt === attemptLimit) {
          // No retry follows, so sleeping and reloading would only delay the failure.
          break;
        }

        await this.sleep(this.retryDelay);
        try {
          await this.recoveryAction(context);
        } catch (recoveryError) {
          // A failed recovery must not hide the operation's own error; the next
          // attempt decides whether the situation is still recoverable.
          console.warn('恢复策略失败:', recoveryError?.message);
        }
      }
    }

    throw new Error(`达到最大重试次数 (${attemptLimit})`, { cause: lastError });
  }

  /**
   * Decides whether a failure is worth retrying.
   * The error's `name`, `code` and `message` are searched case-insensitively, so
   * an error named `TimeoutError` matches as well as a message containing `TIMEOUT`.
   *
   * @param {*} error - The thrown value; non-Error values are tolerated.
   * @returns {boolean}
   */
  isRecoverable(error) {
    const recoverableErrors = [
      'TIMEOUT',
      'ELEMENT_NOT_INTERACTABLE',
      'NETWORK_ERROR',
      'CONNECTION_REFUSED'
    ];

    const haystack = [error?.name, error?.code, error?.message]
      .filter((part) => typeof part === 'string')
      .join(' ')
      .toUpperCase();

    return recoverableErrors.some((type) => haystack.includes(type));
  }

  /**
   * Attempts to bring the environment back to a usable state: reloads
   * `context.page` and refreshes `context.session` when they are present.
   *
   * @param {{page?: object, session?: object}} context
   * @returns {Promise<void>}
   */
  async recoveryAction(context) {
    if (context.page) {
      await context.page.reload();
    }

    if (context.session) {
      await context.session.refresh();
    }
  }

  /**
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>} Resolves after the delay.
   */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}
// Article section heading: "4. 性能优化"
缓存机制
/**
 * In-memory key/value cache whose entries expire a fixed time after being written.
 */
class CacheManager {
  /**
   * @param {number} [cacheTTL=3600000] - Entry lifetime in milliseconds (default: 1 hour).
   */
  constructor(cacheTTL = 3600000) {
    this.cache = new Map();
    this.ttl = cacheTTL;
  }

  /**
   * Returns the cached value for `key`, or `null` when the key is absent or expired.
   * An expired entry is removed on read so it does not linger until `cleanup()`.
   *
   * @param {*} key
   * @returns {*} The stored data, or `null`.
   */
  get(key) {
    const item = this.cache.get(key);
    if (!item) {
      return null;
    }
    if (this.#isExpired(item)) {
      this.cache.delete(key);
      return null;
    }
    return item.data;
  }

  /**
   * Stores `data` under `key`, stamping it with the current time.
   *
   * @param {*} key
   * @param {*} data
   */
  set(key, data) {
    this.cache.set(key, { data, timestamp: Date.now() });
  }

  /** Removes every entry. */
  clear() {
    this.cache.clear();
  }

  /** Removes every expired entry, using the same expiry rule as `get()`. */
  cleanup() {
    const now = Date.now();
    for (const [key, item] of this.cache) {
      if (this.#isExpired(item, now)) {
        this.cache.delete(key);
      }
    }
  }

  /**
   * Single definition of "expired" shared by `get()` and `cleanup()`: an entry
   * whose age has reached the TTL is expired.
   */
  #isExpired(item, now = Date.now()) {
    return now - item.timestamp >= this.ttl;
  }
}
// Article section heading: "并发控制"
/**
 * Runs queued tasks with at most `maxConcurrency` of them in flight at once.
 */
class ConcurrentExecutor {
  /**
   * @param {number} [maxConcurrency=3] - Maximum number of tasks running simultaneously.
   */
  constructor(maxConcurrency = 3) {
    this.maxConcurrency = maxConcurrency;
    this.running = new Set();
    this.queue = [];
  }

  /**
   * Queues `task` and starts it as soon as a slot is free.
   *
   * @param {Function} task - Called with no arguments; may return a value or a promise.
   * @returns {Promise<*>} Settles with the task's result, or rejects with its error
   *   (including an error thrown synchronously by `task`).
   */
  addTask(task) {
    return new Promise((resolve, reject) => {
      this.queue.push({ task, resolve, reject });
      void this.processQueue();
    });
  }

  /**
   * Starts queued tasks in FIFO order while free slots remain. It is re-invoked
   * whenever a task settles, so the queue drains without external polling.
   *
   * @returns {Promise<void>}
   */
  async processQueue() {
    while (this.running.size < this.maxConcurrency && this.queue.length > 0) {
      const { task, resolve, reject } = this.queue.shift();

      // The Promise executor calls task() synchronously, like the direct call it
      // replaces, but turns a synchronous throw or a non-promise return value
      // into a settled promise instead of breaking the queue.
      const promise = new Promise((settle) => settle(task()));
      this.running.add(promise);

      const release = () => {
        this.running.delete(promise);
        void this.processQueue();
      };

      promise.then(
        (result) => {
          release();
          resolve(result);
        },
        (error) => {
          release();
          reject(error);
        }
      );
    }
  }
}
// Article section heading: "监控与日志"
1. 结构化日志
/**
 * Minimal structured logger: emits one JSON object per entry through `console.log`.
 */
class Logger {
  /**
   * @param {string} [level='info'] - Most verbose level to emit: `error`, `warn`,
   *   `info` or `debug`. An unknown name falls back to `info`.
   */
  constructor(level = 'info') {
    this.levels = { error: 0, warn: 1, info: 2, debug: 3 };
    this.currentLevel = Object.hasOwn(this.levels, level) ? level : 'info';
  }

  /**
   * Emits an entry when `level` is at or above the configured severity.
   * Entries with an unknown level are dropped.
   *
   * @param {string} level - One of the keys of `this.levels`.
   * @param {*} message
   * @param {*} [data={}] - Extra context serialized with the entry.
   */
  log(level, message, data = {}) {
    if (!(this.levels[level] <= this.levels[this.currentLevel])) {
      return;
    }

    const logEntry = {
      timestamp: new Date().toISOString(),
      level,
      message,
      data
    };

    console.log(Logger.#serialize(logEntry));

    // Entries may additionally be written to a file or shipped to a monitoring system.
    if (process.env.NODE_ENV === 'production') {
      this.writeToFile(logEntry);
    }
  }

  /** Logs at `error` level. */
  error(message, data) {
    this.log('error', message, data);
  }

  /** Logs at `warn` level. */
  warn(message, data) {
    this.log('warn', message, data);
  }

  /** Logs at `info` level. */
  info(message, data) {
    this.log('info', message, data);
  }

  /** Logs at `debug` level. */
  debug(message, data) {
    this.log('debug', message, data);
  }

  /**
   * Hook for persisting an entry; intentionally empty in this example.
   *
   * @param {object} logEntry
   */
  writeToFile(logEntry) {
    // File-writing logic goes here.
  }

  /**
   * Serializes an entry without ever throwing: logging must not crash the caller.
   * Error instances keep name/message/stack (they would otherwise become `{}`),
   * BigInt values become strings, and unserializable data (e.g. circular
   * references) is replaced by a short marker.
   */
  static #serialize(entry) {
    const replacer = (_key, value) => {
      if (value instanceof Error) {
        return { name: value.name, message: value.message, stack: value.stack };
      }
      if (typeof value === 'bigint') {
        return value.toString();
      }
      return value;
    };

    try {
      return JSON.stringify(entry, replacer);
    } catch (error) {
      return JSON.stringify({
        timestamp: entry.timestamp,
        level: entry.level,
        message: String(entry.message),
        data: `[unserializable: ${error.message}]`
      });
    }
  }
}
// Article section heading: "2. 性能监控"
/**
 * Collects operation timings and derives simple aggregate metrics from them.
 */
class PerformanceMonitor {
  constructor() {
    const requestTimes = [];
    const errorCounts = {};
    const successRates = [];
    this.metrics = { requestTimes, errorCounts, successRates };
  }

  /**
   * Begins timing an operation.
   *
   * @param {string} operation - Label recorded with the measurement.
   * @returns {{operation: string, startTime: number}} Token to hand to `endTimer`.
   */
  startTimer(operation) {
    const startTime = Date.now();
    return { operation, startTime };
  }

  /**
   * Finishes a measurement and appends it to `metrics.requestTimes`.
   *
   * @param {{operation: string, startTime: number}} timer - Token from `startTimer`.
   * @returns {number} Elapsed time in milliseconds.
   */
  endTimer(timer) {
    const { operation, startTime } = timer;
    const duration = Date.now() - startTime;
    const sample = { operation, duration, timestamp: Date.now() };
    this.metrics.requestTimes.push(sample);
    return duration;
  }

  /**
   * @returns {object} The raw metric collections plus `averageResponseTime`
   *   and `errorRate`.
   */
  getMetrics() {
    return Object.assign({}, this.metrics, {
      averageResponseTime: this.calculateAverageTime(),
      errorRate: this.calculateErrorRate()
    });
  }

  /**
   * @returns {number} Mean recorded duration in ms, or 0 when nothing was recorded.
   */
  calculateAverageTime() {
    const samples = this.metrics.requestTimes;
    if (samples.length === 0) {
      return 0;
    }

    let totalTime = 0;
    for (const sample of samples) {
      totalTime += sample.duration;
    }
    return totalTime / samples.length;
  }

  /**
   * Placeholder for the error-rate calculation.
   *
   * @returns {number} Always 0 (example value).
   */
  calculateErrorRate() {
    return 0;
  }
}
// Article section heading: "部署策略"
1. Docker化部署
# Container image for the automation service, based on the Node 18 Alpine image.
FROM node:18-alpine

# Install Chrome (the Alpine chromium package plus the libraries and fonts listed with it)
RUN apk add --no-cache \
    chromium \
    nss \
    freetype \
    harfbuzz \
    ca-certificates \
    ttf-freefont

# Set the Chrome path
# NOTE(review): presumably read by Puppeteer to pick the browser binary; confirm the
# path matches what the installed chromium package provides.
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser

# Set the working directory
WORKDIR /app

# Copy the package manifests first
COPY package*.json ./

# Install dependencies
# NOTE(review): only production dependencies are installed, yet `npm run build` runs
# below — confirm the build script needs no devDependencies.
RUN npm ci --only=production

# Copy the application code
COPY . .

# Build the application
RUN npm run build

# Expose the port
EXPOSE 3000

# Start the application
CMD ["node", "dist/app.js"]
# Article section heading: "2. 环境变量配置"
module.exports = { database: { host: process.env.DB_HOST || 'localhost', port: process.env.DB_PORT || 5432, username: process.env.DB_USERNAME, password: process.env.DB_PASSWORD },
automation: { maxRetries: parseInt(process.env.MAX_RETRIES) || 3, timeout: parseInt(process.env.TIMEOUT) || 30000, headless: process.env.HEADLESS !== 'false' },
logging: { level: process.env.LOG_LEVEL || 'info', file: process.env.LOG_FILE || 'automation.log' }};实际应用案例
1. 电商价格监控
/**
 * Fetches the current price shown on a product page and keeps a history of readings.
 */
class PriceMonitor {
  /**
   * @param {string} url - Product page to load.
   * @param {{price: string}} selectors - `price` is the selector of the price element.
   */
  constructor(url, selectors) {
    this.url = url;
    this.selectors = selectors;
    this.priceHistory = [];
  }

  /**
   * Launches a browser, reads the price element's text, records the parsed price
   * in `priceHistory` and returns it. The browser is always closed afterwards.
   *
   * @returns {Promise<number|null>} The parsed price, or `null` when the text
   *   contains no number.
   */
  async getCurrentPrice() {
    const browser = await puppeteer.launch({ headless: 'new' });

    try {
      // Created inside the try block so the browser is closed even if opening
      // the page fails.
      const page = await browser.newPage();
      await page.goto(this.url, { waitUntil: 'networkidle2' });

      // waitForSelector resolves to the element handle, so no second lookup is needed.
      const priceElement = await page.waitForSelector(this.selectors.price);
      const priceText = await priceElement.evaluate((el) => el.textContent);

      const price = this.extractPrice(priceText);

      this.priceHistory.push({
        timestamp: new Date(),
        price
      });

      return price;
    } finally {
      await browser.close();
    }
  }

  /**
   * Extracts the first number from `text`. Comma-grouped thousands such as
   * `1,299.50` are read as one number rather than being cut at the first comma.
   *
   * @param {string|null|undefined} text
   * @returns {number|null} The parsed number, or `null` when there is none.
   */
  extractPrice(text) {
    if (typeof text !== 'string') {
      return null;
    }

    // First alternative: digits grouped by commas in threes; second: a plain number.
    const match = text.match(/\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?/);
    return match ? parseFloat(match[0].replaceAll(',', '')) : null;
  }

  /**
   * @returns {Array<{timestamp: Date, price: number|null}>} All recorded readings,
   *   oldest first (the internal array itself, not a copy).
   */
  getPriceHistory() {
    return this.priceHistory;
  }
}
// Article section heading: "2. 社交媒体数据采集"
/**
 * Scrapes basic profile data from a social-media profile page.
 */
class SocialMediaScraper {
  /**
   * @param {string} platform - Platform identifier; stored but not otherwise used here.
   */
  constructor(platform) {
    this.platform = platform;
    // Chosen once per scraper instance and reused for every launch.
    this.userAgent = this.getUserAgent();
  }

  /**
   * Launches a browser with this scraper's user agent, loads the profile and
   * extracts its data. The browser is always closed afterwards.
   *
   * @param {string} profileUrl
   * @returns {Promise<{profile: object, timestamp: Date}>}
   */
  async scrapeProfile(profileUrl) {
    const browser = await puppeteer.launch({
      headless: 'new',
      args: [`--user-agent=${this.userAgent}`]
    });

    try {
      // Created inside the try block so the browser is closed even if opening
      // the page fails.
      const page = await browser.newPage();

      await page.goto(profileUrl, {
        waitUntil: 'networkidle2',
        timeout: 60000
      });

      const data = await this.extractProfileData(page);

      return {
        profile: data,
        timestamp: new Date()
      };
    } finally {
      await browser.close();
    }
  }

  /**
   * Collects the profile fields one after another.
   * NOTE(review): `getFollowersCount`, `getPostsCount` and `getBio` are not defined
   * in this class as shown; presumably a subclass or later code supplies them — confirm.
   *
   * @param {object} page - The loaded profile page.
   * @returns {Promise<{followers: *, posts: *, bio: *}>}
   */
  async extractProfileData(page) {
    return {
      followers: await this.getFollowersCount(page),
      posts: await this.getPostsCount(page),
      bio: await this.getBio(page)
    };
  }

  /**
   * @returns {string} One of the built-in user-agent strings, picked at random.
   */
  getUserAgent() {
    const userAgents = [
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
      'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
    ];

    return userAgents[Math.floor(Math.random() * userAgents.length)];
  }
}
// Article section heading: "性能优化技巧"
1. 并发处理
/**
 * Processes URLs in consecutive batches: the URLs within a batch run in parallel,
 * and the next batch starts only after the current one has finished.
 *
 * @param {string[]} urls - URLs handed to `processUrl` one by one.
 * @param {number} [batchSize=3] - URLs processed in parallel; must be a positive integer.
 * @returns {Promise<Array>} The results in the same order as `urls`.
 * @throws {RangeError} When `batchSize` is not a positive integer. A value of 0 or
 *   less would otherwise never advance through the list and loop forever.
 */
async function processMultipleUrls(urls, batchSize = 3) {
  if (!Number.isInteger(batchSize) || batchSize < 1) {
    throw new RangeError(`batchSize must be a positive integer, got ${batchSize}`);
  }

  const results = [];

  for (let start = 0; start < urls.length; start += batchSize) {
    const batch = urls.slice(start, start + batchSize);

    // Promise.all is fail-fast: the first rejection in a batch rejects the whole call.
    const batchResults = await Promise.all(batch.map(async (url) => processUrl(url)));
    results.push(...batchResults);
  }

  return results;
}
// Article section heading: "2. 智能等待策略"
/**
 * Waits for an element, retrying with a growing timeout and a page reload in between.
 */
class SmartWaiter {
  /**
   * @param {object} page - Object exposing async `waitForSelector`, `$` and `reload`
   *   methods (a Puppeteer `Page` in the surrounding article).
   */
  constructor(page) {
    this.page = page;
    this.defaultTimeout = 30000;
    this.maxTimeout = 120000;
  }

  /**
   * Waits for `selector` and returns the matching element.
   *
   * When an attempt times out, the recovery strategy runs and the wait is repeated
   * with a timeout 1.5x longer, capped at `maxTimeout`. The attempt that used
   * `maxTimeout` (or any larger caller-supplied timeout) is the last one. Errors
   * other than a timeout are rethrown immediately.
   *
   * @param {string} selector
   * @param {{timeout?: number, visible?: boolean}} [options] - `timeout` defaults to
   *   `defaultTimeout`; `visible` defaults to `true` and an explicit `false` is honoured.
   * @returns {Promise<object|null>} The result of `page.$(selector)` after the wait succeeds.
   * @throws The last timeout error once retries are exhausted, or any non-timeout error.
   */
  async waitForElement(selector, options = {}) {
    // `??` rather than `||` so explicit `visible: false` and `timeout: 0` survive.
    const visible = options.visible ?? true;
    let timeout = options.timeout ?? this.defaultTimeout;

    for (;;) {
      try {
        await this.page.waitForSelector(selector, { timeout, visible });
        return await this.page.$(selector);
      } catch (error) {
        // A non-positive timeout cannot grow by multiplication, so it is never retried.
        const canRetry =
          error?.name === 'TimeoutError' && timeout > 0 && timeout < this.maxTimeout;
        if (!canRetry) {
          throw error;
        }

        await this.recoveryStrategy();
        timeout = Math.min(timeout * 1.5, this.maxTimeout);
      }
    }
  }

  /**
   * Recovery step between attempts: reloads the page. A failed reload is logged
   * and otherwise ignored so the next attempt can still run.
   *
   * @returns {Promise<void>}
   */
  async recoveryStrategy() {
    try {
      await this.page.reload({ waitUntil: 'networkidle2' });
    } catch (error) {
      console.warn('恢复策略失败:', error.message);
    }
  }
}
// Article section heading: "总结"
构建生产级Web自动化解决方案需要综合考虑架构设计、错误处理、性能优化等多个方面。通过模块化设计、智能定位策略、完善的错误处理机制和性能优化手段,可以打造出稳定、高效的自动化解决方案。
最重要的是,持续的监控和改进是确保自动化系统长期稳定运行的关键。通过收集运行数据、分析性能指标,不断优化和完善系统,才能适应不断变化的环境需求。
本文首发于Fuwari博客,欢迎交流讨论