Crawler – Implementing a Crawler in AWS Lambda

Following the previous post: "Nodejs – Comparing Page Screenshots in AWS Lambda"

On the premise that code should do the work wherever it can replace manual effort, the more convenient option is to let a crawler collect the site's URLs.
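The handler below relies on three npm packages: simplecrawler does the crawling, aws-lambda-http-response wraps the Lambda callback in an HTTP-style response, and url-join builds the start URL. A minimal package.json sketch (the version ranges are assumptions, not from the original setup):

{
  "name": "lambda-crawler",
  "main": "index.js",
  "dependencies": {
    "aws-lambda-http-response": "*",
    "simplecrawler": "^1.1.0",
    "url-join": "^4.0.0"
  }
}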

'use strict';

const Crawler = require('simplecrawler');
const AwsLambdaHttpResponse = require('aws-lambda-http-response');
const urljoin = require('url-join');

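// Lambda entry point: validate the event parameters, run the crawl, and
// reply through the aws-lambda-http-response wrapper.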
exports.handler = (event, context, callback) => {
  if (!exports.validateParameters(event)) {
    return new AwsLambdaHttpResponse({callback}).error({
      body: 'Parameters required: host, path.'
    });
  }

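  // Tell Lambda to return as soon as the callback fires, instead of waiting
  // for the crawler's leftover timers and sockets to drain from the event loop.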
  context.callbackWaitsForEmptyEventLoop = false;
  return exports
      .run(event)
      .then((result) => {
        return new AwsLambdaHttpResponse({callback}).success({
          body: result
        });
      })
      .catch((err) => {
        return new AwsLambdaHttpResponse({callback}).error({
          body: err
        });
      });
};

/**
 * @name run
 * @description Crawl pages under host + path and resolve with the list of unique paths found
 * @param {Object} event
 * {
 *    host: 'http://www.baidu.com',
 *    path: '/',
 *    interval: 250,
 *    maxConcurrency: 20,
 *    maxDepth: 2,
 *    needsAuth: false,
 *    user: '',
 *    password: ''
 * }
 * @return {Promise}
 */
exports.run = function run(event) {
  const paths = [];
  return new Promise((resolve, reject) => {
    const url = urljoin(event.host, event.path);
    console.log('url', url);

    const crawler = new Crawler(url);

    // Crawler settings: values from the event take precedence; the environment
    // variables CRAWLER_INTERVAL, CRAWLER_MAX_CONCURRENCY and CRAWLER_MAX_DEPTH
    // are the fallback. process.env values are strings, so coerce to numbers.
    crawler.interval =
        Number(event.interval || process.env.CRAWLER_INTERVAL);
    crawler.maxConcurrency =
        Number(event.maxConcurrency || process.env.CRAWLER_MAX_CONCURRENCY);
    crawler.maxDepth =
        Number(event.maxDepth || process.env.CRAWLER_MAX_DEPTH);
    if (event.needsAuth) {
      crawler.needsAuth = true;
      crawler.authUser = event.user;
      crawler.authPass = event.password;
    }

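    // Only queue page URLs: skip static and binary assets by extension.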
    crawler.addFetchCondition((queueItem, referrerQueueItem, callback) => {
      const file = /\.(pdf|png|jpg|jpeg|svg|gif|ico|js|css|mp4)$/i;
      callback(null, !queueItem.path.match(file));
    });

    crawler.on('crawlstart', () => {
      console.log('Crawler start');
    });

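    // Record each successfully fetched path exactly once.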
    crawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
      if (!paths.includes(queueItem.path)) paths.push(queueItem.path);
    });

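    // Failed fetches (simplecrawler's fetcherror / fetchclienterror events)
    // are not handled here, so the promise resolves with whatever succeeded
    // and never rejects.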
    crawler.on('complete', () => {
      console.log('Crawler end.');
      console.log('paths', paths);
      return resolve(paths);
    });

    crawler.start();
  });
};

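// host and path are required; interval, maxConcurrency and maxDepth all have
// environment-variable fallbacks.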
exports.validateParameters = function validateParameters(params) {
  if (!params.host || !params.path) {
    return false;
  }
  return true;
};
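
To try the handler without deploying it, call it directly with a fake event and context. A minimal local smoke test, assuming the handler above is saved as index.js and that aws-lambda-http-response hands its result to the Lambda callback (which is how the handler uses it):

'use strict';

// test.js – run with: node test.js
const lambda = require('./index');

lambda.handler(
  {host: 'http://www.baidu.com', path: '/', interval: 250, maxConcurrency: 20, maxDepth: 2},
  {}, // stand-in for the Lambda context; the handler only sets a flag on it
  (err, response) => {
    console.log('err', err);
    console.log('response', response);
  }
);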