1796 字
9 分钟
Web自动化最佳实践:从脚本到生产级解决方案

Web自动化最佳实践:从脚本到生产级解决方案#

引言#

Web自动化已成为现代软件开发中不可或缺的一部分。无论是数据采集、测试自动化还是业务流程自动化,都需要高效、可靠的技术方案。本文将分享从简单脚本到生产级Web自动化解决方案的完整实践指南。

架构设计原则#

1. 分层架构#

// Example of a layered automation architecture, described as plain data.
const automationArchitecture = {
  // Control layer
  controller: { strategy: 'page-object', flow: 'sequential' },

  // Business-logic layer
  services: {
    authentication: 'auth-service',
    dataProcessing: 'processor-service',
    errorHandling: 'error-handler',
  },

  // Infrastructure layer
  infrastructure: {
    browser: 'puppeteer',
    storage: 'file-system',
    logging: 'structured-logs',
  },
};

2. 模块化设计#

将功能拆分为独立的模块,便于维护和扩展:

modules/
├── auth/              // 认证模块
│   ├── login.js
│   └── session.js
├── navigation/        // 导航模块
│   ├── page-loader.js
│   └── url-manager.js
├── interaction/       // 交互模块
│   ├── form-handler.js
│   └── element-locator.js
└── data/              // 数据处理模块
    ├── extractor.js
    └── validator.js

核心技术实现#

1. 浏览器自动化#

Puppeteer高级配置#

// Loads Puppeteer and launches a browser with a customised configuration.
// NOTE(review): `require` (CommonJS) is combined with a top-level `await`;
// this presumably has to live inside an async function to run — confirm how
// the snippet is embedded.
const puppeteer = require('puppeteer');
const browser = await puppeteer.launch({
headless: 'new',
// Command-line switches handed to the launched browser.
// NOTE(review): '--no-sandbox', '--disable-setuid-sandbox' and
// '--disable-web-security' appear to relax browser security protections —
// confirm they are acceptable for the sites being automated before using
// this configuration in production.
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-web-security',
'--disable-features=TranslateUI',
'--disable-ipc-flooding-protection'
],
// NOTE(review): by its name this accepts invalid HTTPS certificates — verify
// that is intended outside of test environments.
ignoreHTTPSErrors: true,
// No fixed viewport is imposed (null).
defaultViewport: null,
// Browser profile data is kept in a directory relative to the working directory.
userDataDir: './user_data'
});

多页面管理#

/**
 * Keeps track of the pages opened from one browser instance under
 * caller-chosen names so they can be looked up and closed together.
 */
class PageManager {
  /**
   * @param {object} browser - Browser handle; must provide an async `newPage()`.
   */
  constructor(browser) {
    this.browser = browser;
    this.pages = new Map();
  }

  /**
   * Opens a new page, sets its default timeout and registers it under `name`.
   *
   * Registering a name that is already in use replaces the map entry; the
   * previously registered page is not closed by this method.
   *
   * @param {string} name - Key used to retrieve the page later.
   * @param {{ timeout?: number }} [options] - `timeout` is forwarded to
   *   `page.setDefaultTimeout`; 30000 is used when it is omitted.
   * @returns {Promise<object>} The newly created page.
   */
  async createPage(name, options = {}) {
    const page = await this.browser.newPage();
    // `??` instead of `||`: an explicit `timeout: 0` must reach the page
    // rather than being silently replaced by the default.
    page.setDefaultTimeout(options.timeout ?? 30000);
    this.pages.set(name, page);
    return page;
  }

  /**
   * @param {string} name
   * @returns {object|undefined} The page registered under `name`, if any.
   */
  getPage(name) {
    return this.pages.get(name);
  }

  /**
   * Closes every registered page one after another and empties the registry.
   * A failure to close one page is logged and does not stop the others.
   *
   * @returns {Promise<void>}
   */
  async closeAll() {
    for (const [name, page] of this.pages) {
      try {
        await page.close();
      } catch (error) {
        console.warn(`关闭页面 ${name} 时出错:`, error.message);
      }
    }
    this.pages.clear();
  }
}

2. 智能元素定位#

多重定位策略#

/**
 * Locates an element on a page by trying several selector strategies in a
 * fixed order and returning the first match.
 */
class ElementLocator {
  /**
   * @param {object} page - Page handle; must provide an async `$(selector)`.
   */
  constructor(page) {
    this.page = page;
    // Order in which strategies are attempted. 'name' and 'label' have no
    // dedicated query in tryLocate(); `selectors.name` is only used through
    // the fallback in buildSelector().
    this.strategies = [
      'id',
      'css',
      'xpath',
      'text',
      'name',
      'label'
    ];
  }

  /**
   * Tries each strategy in turn and returns the first element found.
   *
   * @param {object} selectors - Map of strategy name to selector value,
   *   e.g. `{ id: 'submit', xpath: '//button' }`.
   * @returns {Promise<object>} The first element handle found.
   * @throws {Error} When no strategy yields an element. If a strategy threw,
   *   the last such error is attached as `cause`.
   */
  async findElement(selectors) {
    let lastError = null;
    for (const strategy of this.strategies) {
      try {
        const element = await this.tryLocate(selectors, strategy);
        if (element) {
          return element;
        }
      } catch (error) {
        // Best effort: a failing strategy must not stop the remaining ones,
        // but keep the error so the final failure can be diagnosed.
        lastError = error;
      }
    }
    throw new Error(
      `无法定位元素: ${JSON.stringify(selectors)}`,
      lastError ? { cause: lastError } : undefined
    );
  }

  /**
   * Runs a single strategy against the page.
   *
   * @param {object} selectors - See findElement().
   * @param {string} strategy - One of the names in `this.strategies`.
   * @returns {Promise<object|null>} The element, or null when the strategy
   *   has nothing to query with, is unsupported, or finds no match.
   */
  async tryLocate(selectors, strategy) {
    const selector = this.buildSelector(selectors, strategy);
    // Nothing to search for: skip the query instead of asking the page for
    // "#undefined" and the like.
    if (selector == null || selector === '') {
      return null;
    }
    switch (strategy) {
      case 'id':
        return this.page.$(`#${selector}`);
      case 'css':
        return this.page.$(selector);
      case 'xpath':
        // Puppeteer's prefixed selector syntax is `xpath/<expr>`;
        // the `xpath=` form is not understood by Puppeteer's `page.$`.
        return this.page.$(`xpath/${selector}`);
      case 'text':
        // Likewise `text/<text>` rather than `text=<text>`.
        return this.page.$(`text/${selector}`);
      default:
        return null;
    }
  }

  /**
   * Picks the selector value to use for a strategy.
   *
   * @param {object} selectors - See findElement().
   * @param {string} strategy
   * @returns {*} `selectors[strategy]` when it is truthy; otherwise
   *   `selectors.text`, or `selectors.name` when that is falsy too (possibly
   *   undefined).
   */
  buildSelector(selectors, strategy) {
    if (selectors[strategy]) {
      return selectors[strategy];
    }
    return selectors.text || selectors.name;
  }
}

3. 错误处理与恢复机制#

全局错误处理#

/**
 * Runs an async operation with a bounded number of attempts, pausing and
 * performing a recovery step between attempts when the failure looks
 * transient.
 */
class ErrorHandler {
  constructor() {
    // Number of attempts made by executeWithRetry().
    this.retryCount = 3;
    // Pause between attempts, in milliseconds (passed to setTimeout).
    this.retryDelay = 2000;
    // Not read by any method of this class.
    this.maxRetryCount = 5;
  }

  /**
   * Calls `fn` until it succeeds, a non-recoverable error occurs, or
   * `retryCount` attempts have been used.
   *
   * @param {Function} fn - Async operation to run; called without arguments.
   * @param {{ page?: object, session?: object }} [context] - Optional objects
   *   used by recoveryAction() between attempts.
   * @returns {Promise<*>} Whatever `fn` resolves to.
   * @throws The original error when it is not recoverable; otherwise an Error
   *   with the last failure attached as `cause` once all attempts are used.
   */
  async executeWithRetry(fn, context = {}) {
    let attempts = 0;
    let lastError = null;
    while (attempts < this.retryCount) {
      try {
        return await fn();
      } catch (error) {
        attempts++;
        lastError = error;
        console.warn(
          `执行失败 (尝试 ${attempts}/${this.retryCount}):`,
          error?.message ?? error
        );
        if (!this.isRecoverable(error)) {
          throw error;
        }
        if (attempts >= this.retryCount) {
          // No attempt left: sleeping and recovering would only delay the failure.
          break;
        }
        await this.sleep(this.retryDelay);
        try {
          await this.recoveryAction(context);
        } catch (recoveryError) {
          // Recovery is best effort; a failed recovery must not replace the
          // error of the operation itself. The next attempt decides.
          console.warn('恢复操作失败:', recoveryError?.message ?? recoveryError);
        }
      }
    }
    throw new Error(`达到最大重试次数 (${this.retryCount})`, { cause: lastError });
  }

  /**
   * Decides whether a failure is worth retrying.
   *
   * The error's `name`, `code` and `message` are searched case-insensitively
   * for the known markers, so an error named `TimeoutError` with a lower-case
   * message is recognised as well as a message containing 'TIMEOUT'.
   * Non-Error values (strings, null) are handled without throwing.
   *
   * @param {*} error - The thrown value.
   * @returns {boolean}
   */
  isRecoverable(error) {
    const recoverableErrors = [
      'TIMEOUT',
      'ELEMENT_NOT_INTERACTABLE',
      'NETWORK_ERROR',
      'CONNECTION_REFUSED'
    ];
    const haystack = [error?.name, error?.code, error?.message ?? error]
      .filter((part) => part != null)
      .map(String)
      .join(' ')
      .toUpperCase();
    return recoverableErrors.some((type) => haystack.includes(type));
  }

  /**
   * Recovery step run between attempts: reloads `context.page` and refreshes
   * `context.session` when they are provided.
   *
   * @param {{ page?: object, session?: object }} context
   * @returns {Promise<void>}
   */
  async recoveryAction(context) {
    if (context?.page) {
      await context.page.reload();
    }
    if (context?.session) {
      await context.session.refresh();
    }
  }

  /**
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>} Resolves after the delay.
   */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}

4. 性能优化#

缓存机制#

/**
 * In-memory key/value cache whose entries expire a fixed time after they
 * were written.
 */
class CacheManager {
  /**
   * @param {number} [cacheTTL=3600000] - Entry lifetime in milliseconds
   *   (default: one hour).
   */
  constructor(cacheTTL = 3600000) {
    this.cache = new Map();
    this.ttl = cacheTTL;
  }

  /**
   * Returns the cached value for `key`, or null when there is no entry or
   * the entry has expired. An expired entry is removed on access so that it
   * does not linger until the next cleanup().
   *
   * @param {*} key
   * @returns {*} The stored data, or null.
   */
  get(key) {
    const item = this.cache.get(key);
    if (!item) {
      return null;
    }
    if (Date.now() - item.timestamp < this.ttl) {
      return item.data;
    }
    this.cache.delete(key);
    return null;
  }

  /**
   * Stores `data` under `key` and stamps it with the current time.
   *
   * @param {*} key
   * @param {*} data
   */
  set(key, data) {
    this.cache.set(key, {
      data,
      timestamp: Date.now()
    });
  }

  /** Removes every entry. */
  clear() {
    this.cache.clear();
  }

  /**
   * Removes all expired entries. Uses the same boundary as get(): an entry
   * whose age has reached the TTL is expired.
   */
  cleanup() {
    const now = Date.now();
    for (const [key, item] of this.cache.entries()) {
      if (now - item.timestamp >= this.ttl) {
        this.cache.delete(key);
      }
    }
  }
}

并发控制#

/**
 * Runs queued tasks with at most `maxConcurrency` of them in flight at once.
 */
class ConcurrentExecutor {
  /**
   * @param {number} [maxConcurrency=3] - Upper bound on simultaneously
   *   running tasks. Tasks are only started while fewer than this many are
   *   running.
   */
  constructor(maxConcurrency = 3) {
    this.maxConcurrency = maxConcurrency;
    this.running = new Set();
    this.queue = [];
  }

  /**
   * Queues a task and returns a promise for its outcome.
   *
   * @param {Function} task - Called without arguments when a slot is free;
   *   may return a value or a promise, and may throw.
   * @returns {Promise<*>} Settles with the task's result or error.
   */
  async addTask(task) {
    return new Promise((resolve, reject) => {
      this.queue.push({ task, resolve, reject });
      this.processQueue();
    });
  }

  /**
   * Starts queued tasks in FIFO order while capacity is available. Called
   * again each time a running task settles.
   *
   * @returns {Promise<void>}
   */
  async processQueue() {
    while (this.running.size < this.maxConcurrency && this.queue.length > 0) {
      const { task, resolve, reject } = this.queue.shift();
      // Invoke the task inside a promise executor so that a synchronous throw
      // or a plain (non-promise) return value settles the caller's promise
      // instead of escaping this loop and leaving the caller pending.
      const promise = new Promise((settle) => settle(task()));
      this.running.add(promise);
      // Free the slot first, then report the outcome, then start the next task.
      const finishWith = (report) => (outcome) => {
        this.running.delete(promise);
        report(outcome);
        this.processQueue();
      };
      promise.then(finishWith(resolve), finishWith(reject));
    }
  }
}

监控与日志#

1. 结构化日志#

/**
 * Minimal structured logger: emits one JSON object per log call to stdout,
 * filtered by severity level.
 */
class Logger {
  /**
   * @param {string} [level='info'] - Most verbose level to emit: one of
   *   'error', 'warn', 'info', 'debug'. Unknown values fall back to 'info'.
   */
  constructor(level = 'info') {
    // Lower number = more severe.
    this.levels = {
      error: 0,
      warn: 1,
      info: 2,
      debug: 3
    };
    this.currentLevel = Object.hasOwn(this.levels, level) ? level : 'info';
  }

  /**
   * JSON.stringify replacer that expands Error instances, whose own
   * properties are non-enumerable and would otherwise serialise as `{}`.
   *
   * @param {string} key
   * @param {*} value
   * @returns {*}
   */
  static toSerializable(key, value) {
    if (value instanceof Error) {
      return {
        name: value.name,
        message: value.message,
        stack: value.stack,
        cause: value.cause
      };
    }
    return value;
  }

  /**
   * Emits a log entry when `level` is at least as severe as the current
   * level. Entries with a level that is not in `this.levels` are dropped.
   *
   * @param {string} level
   * @param {string} message
   * @param {*} [data={}] - Extra structured context.
   */
  log(level, message, data = {}) {
    // Comparing with `undefined` is always false, so unknown levels are dropped.
    if (!(this.levels[level] <= this.levels[this.currentLevel])) {
      return;
    }
    const logEntry = {
      timestamp: new Date().toISOString(),
      level,
      message,
      data
    };
    let line;
    try {
      line = JSON.stringify(logEntry, Logger.toSerializable);
    } catch (error) {
      // Circular structures, BigInt, etc.: logging must not crash the caller.
      line = JSON.stringify({
        ...logEntry,
        data: `[unserializable: ${error.message}]`
      });
    }
    console.log(line);
    // In production the entry is additionally handed to writeToFile().
    if (process.env.NODE_ENV === 'production') {
      this.writeToFile(logEntry);
    }
  }

  /** Logs at 'error' level. */
  error(message, data) {
    this.log('error', message, data);
  }

  /** Logs at 'warn' level. */
  warn(message, data) {
    this.log('warn', message, data);
  }

  /** Logs at 'info' level. */
  info(message, data) {
    this.log('info', message, data);
  }

  /** Logs at 'debug' level. */
  debug(message, data) {
    this.log('debug', message, data);
  }

  /**
   * Hook for persisting an entry; intentionally empty here.
   *
   * @param {object} logEntry
   */
  writeToFile(logEntry) {
    // File-writing logic goes here.
  }
}

2. 性能监控#

/**
 * Collects timing samples for named operations and derives summary metrics.
 */
class PerformanceMonitor {
  constructor() {
    const requestTimes = [];
    const errorCounts = {};
    const successRates = [];
    this.metrics = { requestTimes, errorCounts, successRates };
  }

  /**
   * Begins timing an operation.
   *
   * @param {string} operation - Label for the operation being timed.
   * @returns {{ operation: string, startTime: number }} Token for endTimer().
   */
  startTimer(operation) {
    const startTime = Date.now();
    return { operation, startTime };
  }

  /**
   * Finishes a timing started with startTimer() and records the sample.
   *
   * @param {{ operation: string, startTime: number }} timer
   * @returns {number} Elapsed milliseconds.
   */
  endTimer(timer) {
    const elapsed = Date.now() - timer.startTime;
    const sample = {
      operation: timer.operation,
      duration: elapsed,
      timestamp: Date.now(),
    };
    this.metrics.requestTimes.push(sample);
    return elapsed;
  }

  /**
   * @returns {object} The raw metric collections plus `averageResponseTime`
   *   and `errorRate`.
   */
  getMetrics() {
    return Object.assign({}, this.metrics, {
      averageResponseTime: this.calculateAverageTime(),
      errorRate: this.calculateErrorRate(),
    });
  }

  /**
   * @returns {number} Mean duration of all recorded samples; 0 when there
   *   are none.
   */
  calculateAverageTime() {
    const samples = this.metrics.requestTimes;
    if (samples.length === 0) {
      return 0;
    }
    let total = 0;
    for (const sample of samples) {
      total = total + sample.duration;
    }
    return total / samples.length;
  }

  /**
   * Placeholder for the error-rate calculation.
   *
   * @returns {number} Always 0 (example value).
   */
  calculateErrorRate() {
    return 0;
  }
}

部署策略#

1. Docker化部署#

FROM node:18-alpine

# Install the distribution's Chromium plus the libraries and fonts it needs.
RUN apk add --no-cache \
    chromium \
    nss \
    freetype \
    harfbuzz \
    ca-certificates \
    ttf-freefont

# Point Puppeteer at the system Chromium and skip its own browser download
# during `npm ci`: the downloaded build is not used once an executable path
# is configured.
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser \
    PUPPETEER_SKIP_DOWNLOAD=true

# Working directory
WORKDIR /app

# Copy the package manifests first so the dependency layer can be cached.
COPY package*.json ./

# Install ALL dependencies here. Installing production-only packages before
# `npm run build` breaks the build whenever the build script relies on
# devDependencies (compilers, bundlers).
RUN npm ci

# Copy the application code
COPY . .

# Build, then strip devDependencies so the final image carries only runtime
# packages. (`--omit=dev` replaces the deprecated `--only=production`.)
RUN npm run build && npm prune --omit=dev

# Expose the application port
EXPOSE 3000

# Start the application
CMD ["node", "dist/app.js"]

2. 环境变量配置#

config.js
module.exports = {
database: {
host: process.env.DB_HOST || 'localhost',
port: process.env.DB_PORT || 5432,
username: process.env.DB_USERNAME,
password: process.env.DB_PASSWORD
},
automation: {
maxRetries: parseInt(process.env.MAX_RETRIES) || 3,
timeout: parseInt(process.env.TIMEOUT) || 30000,
headless: process.env.HEADLESS !== 'false'
},
logging: {
level: process.env.LOG_LEVEL || 'info',
file: process.env.LOG_FILE || 'automation.log'
}
};

实际应用案例#

1. 电商价格监控#

/**
 * Fetches the price shown on a product page and keeps a history of the
 * values it has read.
 */
class PriceMonitor {
  /**
   * @param {string} url - Product page to load.
   * @param {{ price: string }} selectors - `price` is the selector of the
   *   element whose text contains the price.
   */
  constructor(url, selectors) {
    this.url = url;
    this.selectors = selectors;
    this.priceHistory = [];
  }

  /**
   * Launches a browser, reads the current price from the page and appends
   * it to the history. The browser is always closed, including when opening
   * the page itself fails.
   *
   * @returns {Promise<number|null>} The parsed price, or null when the
   *   element text contains no number.
   */
  async getCurrentPrice() {
    const browser = await puppeteer.launch({ headless: 'new' });
    try {
      // Inside the try so that a failing newPage() cannot leak the browser.
      const page = await browser.newPage();
      await page.goto(this.url, { waitUntil: 'networkidle2' });
      // Use the handle returned by the wait instead of querying a second time.
      const priceElement = await page.waitForSelector(this.selectors.price);
      const priceText = await priceElement.evaluate((el) => el.textContent);
      const price = this.extractPrice(priceText);
      this.priceHistory.push({
        timestamp: new Date(),
        price
      });
      return price;
    } finally {
      await browser.close();
    }
  }

  /**
   * Extracts the first number from a piece of text.
   *
   * Comma-grouped thousands ("1,299.00") are read as one number; any other
   * input yields the same result as a plain "digits with optional decimal
   * part" match.
   *
   * @param {string} text
   * @returns {number|null} The number, or null when `text` is not a string
   *   or contains no digits.
   */
  extractPrice(text) {
    if (typeof text !== 'string') {
      return null;
    }
    // First alternative: 1-3 digits followed by complete ",ddd" groups.
    // Second alternative: the plain number form.
    const match = text.match(/\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+\.?\d*/);
    return match ? parseFloat(match[0].replaceAll(',', '')) : null;
  }

  /**
   * @returns {Array<{ timestamp: Date, price: number|null }>} The recorded
   *   readings (the internal array, not a copy).
   */
  getPriceHistory() {
    return this.priceHistory;
  }
}

2. 社交媒体数据采集#

/**
 * Loads a social-media profile page in a fresh browser and extracts a few
 * profile fields.
 *
 * extractProfileData() calls `getFollowersCount`, `getPostsCount` and
 * `getBio`, none of which are defined in this class.
 * NOTE(review): presumably they are meant to be supplied per platform (for
 * example by a subclass) — confirm.
 */
class SocialMediaScraper {
  /**
   * @param {string} platform - Platform identifier; stored, not otherwise
   *   used by this class.
   */
  constructor(platform) {
    this.platform = platform;
    // Chosen once per instance.
    this.userAgent = this.getUserAgent();
  }

  /**
   * Opens `profileUrl` and returns the extracted data. The browser is always
   * closed, including when opening the page itself fails.
   *
   * @param {string} profileUrl
   * @returns {Promise<{ profile: object, timestamp: Date }>}
   */
  async scrapeProfile(profileUrl) {
    const browser = await puppeteer.launch({
      headless: 'new',
      args: [`--user-agent=${this.userAgent}`]
    });
    try {
      // Inside the try so that a failing newPage() cannot leak the browser.
      const page = await browser.newPage();
      await page.goto(profileUrl, {
        waitUntil: 'networkidle2',
        timeout: 60000
      });
      const data = await this.extractProfileData(page);
      return {
        profile: data,
        timestamp: new Date()
      };
    } finally {
      await browser.close();
    }
  }

  /**
   * Collects the profile fields from a loaded page, one after another.
   *
   * @param {object} page - The loaded page.
   * @returns {Promise<{ followers: *, posts: *, bio: * }>}
   */
  async extractProfileData(page) {
    return {
      followers: await this.getFollowersCount(page),
      posts: await this.getPostsCount(page),
      bio: await this.getBio(page)
    };
  }

  /**
   * @returns {string} One of three built-in user-agent strings, picked at
   *   random.
   */
  getUserAgent() {
    const userAgents = [
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
      'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
    ];
    return userAgents[Math.floor(Math.random() * userAgents.length)];
  }
}

性能优化技巧#

1. 并发处理#

/**
 * Processes URLs in consecutive batches: the URLs inside one batch are
 * handled concurrently, and the next batch starts only after the current one
 * has finished. Results keep the order of `urls`.
 *
 * Relies on a `processUrl(url)` function being in scope. If any URL in a
 * batch fails, the returned promise rejects with that error.
 *
 * @param {string[]} urls - URLs to process.
 * @param {number} [batchSize=3] - Maximum number of URLs handled at once;
 *   must be a number >= 1.
 * @returns {Promise<Array>} One result per URL.
 * @throws {RangeError} When `batchSize` is not a number >= 1 (such values
 *   would otherwise make the batching loop run forever).
 */
async function processMultipleUrls(urls, batchSize = 3) {
  if (typeof batchSize !== 'number' || !(batchSize >= 1)) {
    throw new RangeError(`batchSize must be a number >= 1, got ${batchSize}`);
  }
  const results = [];
  for (let start = 0; start < urls.length; start += batchSize) {
    const batch = urls.slice(start, start + batchSize);
    const batchResults = await Promise.all(batch.map((url) => processUrl(url)));
    results.push(...batchResults);
  }
  return results;
}

2. 智能等待策略#

/**
 * Waits for an element and, on timeout, reloads the page and tries again
 * with a longer timeout until an overall time budget is used up.
 */
class SmartWaiter {
  /**
   * @param {object} page - Page handle; must provide async
   *   `waitForSelector`, `$` and `reload`.
   */
  constructor(page) {
    this.page = page;
    // Timeout of the first attempt when the caller gives none (ms).
    this.defaultTimeout = 30000;
    // Overall budget across all attempts of one waitForElement() call (ms).
    this.maxTimeout = 120000;
  }

  /**
   * Waits for `selector`, retrying after a recovery step when the wait
   * times out.
   *
   * Each retry multiplies the timeout by 1.5, capped at the time remaining
   * in the `maxTimeout` budget, so the call always terminates. Errors other
   * than a `TimeoutError` are rethrown immediately.
   *
   * @param {string} selector
   * @param {{ timeout?: number, visible?: boolean }} [options] - `timeout`
   *   for the first attempt (default `defaultTimeout`); `visible` is passed
   *   to `waitForSelector` (default true, and an explicit false is honoured).
   * @returns {Promise<object|null>} The result of `page.$(selector)` after a
   *   successful wait.
   * @throws The last `TimeoutError` once the budget is exhausted, or any
   *   other error raised while waiting.
   */
  async waitForElement(selector, options = {}) {
    const deadline = Date.now() + this.maxTimeout;
    // `??` rather than `||`, so explicit falsy values are not overridden.
    const visible = options.visible ?? true;
    let timeout = options.timeout ?? this.defaultTimeout;

    for (;;) {
      try {
        await this.page.waitForSelector(selector, { timeout, visible });
        return await this.page.$(selector);
      } catch (error) {
        if (error?.name !== 'TimeoutError' || Date.now() >= deadline) {
          throw error;
        }
        await this.recoveryStrategy();
        // Recovery takes time too; re-check the budget before waiting again.
        const remaining = deadline - Date.now();
        if (remaining <= 0) {
          throw error;
        }
        timeout = Math.min(timeout * 1.5, remaining);
      }
    }
  }

  /**
   * Recovery step between attempts: reloads the page. A failing reload is
   * logged and otherwise ignored.
   *
   * @returns {Promise<void>}
   */
  async recoveryStrategy() {
    try {
      await this.page.reload({ waitUntil: 'networkidle2' });
    } catch (error) {
      console.warn('恢复策略失败:', error.message);
    }
  }
}

总结#

构建生产级Web自动化解决方案需要综合考虑架构设计、错误处理、性能优化等多个方面。通过模块化设计、智能定位策略、完善的错误处理机制和性能优化手段,可以打造出稳定、高效的自动化解决方案。

最重要的是,持续的监控和改进是确保自动化系统长期稳定运行的关键。通过收集运行数据、分析性能指标,不断优化和完善系统,才能适应不断变化的环境需求。


本文首发于Fuwari博客,欢迎交流讨论