Created by: sebastienfi
Quick PR to expose Page's requestfinished
event.
Temporary fix for https://github.com/yujiosaka/headless-chrome-crawler/issues/93. @yujiosaka, your solution of exposing the whole page is way better.
This scenario allows intercepting the request and doing something with the response without having to download the resource a second time, which is the official way of doing this.
const HCCrawler = require('headless-chrome-crawler')
var fs = require('fs')
// Example: save every image fetched while crawling a page into ./tmp,
// reusing the response body already downloaded by the browser instead of
// requesting the resource a second time.
HCCrawler.launch({
  // Function to be evaluated in browsers
  evaluatePage: () => ({
    title: $('title').text()
  }),
  // Function to be called with evaluated results from browsers
  onSuccess: result => {
    console.log(result)
  }
})
  .then(crawler => {
    // The output directory must exist before the first image is written,
    // otherwise writeFileSync throws ENOENT.
    if (!fs.existsSync('./tmp')) {
      fs.mkdirSync('./tmp')
    }

    crawler.on('pagerequestfinished', async request => {
      // The emitter does not observe the promise returned by this handler, so
      // failures are handled here instead of surfacing as unhandled rejections.
      try {
        // Drop the query string and fragment so that "photo.jpg?w=200" is still
        // recognised as an image and "?w=200" does not end up in the file name.
        const resourceUrl = request.url().split(/[?#]/)[0]
        // Matches images (extension compared case-insensitively).
        if (!/\.(jpg|png|gif)$/i.test(resourceUrl)) {
          return
        }
        const segments = resourceUrl.split('/')
        const filename = segments[segments.length - 1]
        const response = request.response()
        // Guard against a request that has no response attached.
        if (!response) {
          return
        }
        const buffer = await response.buffer()
        // No encoding argument: it is ignored when the data is a Buffer.
        fs.writeFileSync(`./tmp/${filename}`, buffer)
      } catch (err) {
        console.error(`Could not save ${request.url()}:`, err)
      }
    })

    crawler.queue({
      maxDepth: 1,
      skipDuplicates: true,
      url: `http://www.example.com`
    })

    // Returned so that a failure while crawling or closing reaches the
    // final catch below.
    return crawler.onIdle() // Resolved when no queue is left
      .then(() => crawler.close()) // Close the crawler
  })
  .catch(err => {
    console.error(err)
  })