Skip to content

Commit

Permalink
Extract stream helpers and WalkHTML methods
Browse files Browse the repository at this point in the history
  • Loading branch information
Kikobeats committed Mar 25, 2016
1 parent dd1c2fb commit 6f73cac
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 125 deletions.
199 changes: 74 additions & 125 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,13 @@ var isUrl = require('is-url')
var isArray = Array.isArray
var fs = require('fs')

function handleStreamError (stream, fn) {
fn(function (err) {
if (err) stream.emit('error', err)
})
}

/**
* Locals
*/

var absolutes = require('./lib/absolutes')
var resolve = require('./lib/resolve')
var streamHelper = require('./lib/stream')
var params = require('./lib/params')
var walk = require('./lib/walk')

Expand Down Expand Up @@ -69,6 +64,9 @@ function Xray () {
var pages = []
var stream

var walkHTML = WalkHTML(xray, selector, scope)
var request = Request(crawler)

function node (source2, fn) {
if (arguments.length === 1) {
fn = source2
Expand All @@ -84,10 +82,10 @@ function Xray () {

if (isUrl(source)) {
debug('starting at: %s', source)
xray.request(source, function (err, html) {
request(source, function (err, html) {
if (err) return next(err)
var $ = load(html, source)
node.html($, next)
walkHTML($, next)
})
} else if (scope && ~scope.indexOf('@')) {
debug('resolving to a url: %s', scope)
Expand All @@ -96,21 +94,21 @@ function Xray () {
// ensure that a@href is a URL
if (!isUrl(url)) {
debug('%s is not a url. Skipping!', url)
return node.html(load(''), next)
return walkHTML(load(''), next)
}

debug('resolved "%s" to a %s', scope, url)
xray.request(url, function (err, html) {
request(url, function (err, html) {
if (err) return next(err)
var $ = load(html, url)
node.html($, next)
walkHTML($, next)
})
} else if (source) {
var $ = load(source)
node.html($, next)
walkHTML($, next)
} else {
debug('%s is not a url or html. Skipping!', source)
return node.html(load(''), next)
return walkHTML(load(''), next)
}

function next (err, obj, $) {
Expand All @@ -120,8 +118,8 @@ function Xray () {

// create the stream
if (!stream) {
if (paginate) stream = stream_array(state.stream)
else stream = stream_object(state.stream)
if (paginate) stream = streamHelper.array(state.stream)
else stream = streamHelper.object(state.stream)
}

if (paginate) {
Expand Down Expand Up @@ -152,10 +150,10 @@ function Xray () {
debug('paginating %j', url)
isFinite(limit) && debug('%s page(s) left to crawl', limit)

xray.request(url, function (err, html) {
request(url, function (err, html) {
if (err) return next(err)
var $ = load(html, url)
node.html($, next)
walkHTML($, next)
})
} else {
stream(obj, true)
Expand All @@ -166,54 +164,6 @@ function Xray () {
return node
}

function load (html, url) {
html = html || ''

This comment has been minimized.

Copy link
@gconnolly

gconnolly Mar 30, 2016

Contributor

@Kikobeats Was this line intentionally removed? I was added as a part of 61738eb and now I am getting exceptions again.

This comment has been minimized.

Copy link
@Kikobeats

Kikobeats Mar 30, 2016

Author Contributor

I think that was removed automatically in your PR under a merge command

This comment has been minimized.

Copy link
@gconnolly

gconnolly Mar 30, 2016

Contributor

I don't see this being removed in a merge commit. I will create a PR to fix it.

This comment has been minimized.

Copy link
@gconnolly

gconnolly Mar 30, 2016

Contributor

Oh, man. I don't know how I missed this: f73990f. Thanks @Kikobeats @nicobevilacqua

This comment has been minimized.

Copy link
@Kikobeats

Kikobeats Mar 30, 2016

Author Contributor

Yep, but I released a minor instead of a patch :'(

var $ = html.html ? html : cheerio.load(html)
if (url) $ = absolutes(url, $)
return $
}

node.html = function ($, fn) {
walk(selector, function (v, k, next) {
if (typeof v === 'string') {
var value = resolve($, root(scope), v)
return next(null, value)
} else if (typeof v === 'function') {
return v($, function (err, obj) {
if (err) return next(err)
return next(null, obj)
})
} else if (isArray(v)) {
if (typeof v[0] === 'string') {
return next(null, resolve($, root(scope), v))
} else if (typeof v[0] === 'object') {
var $scope = $.find ? $.find(scope) : $(scope)
var pending = $scope.length
var out = []

// Handle the empty result set (thanks @jenbennings!)
if (!pending) return next(null, out)

$scope.each(function (i, el) {
var $innerscope = $scope.eq(i)
var node = xray(scope, v[0])
node($innerscope, function (err, obj) {
if (err) return next(err)
out[i] = obj
if (!--pending) {
return next(null, compact(out))
}
})
})
}
}
return next()
}, function (err, obj) {
if (err) return fn(err)
fn(null, obj, $)
})
}

node.paginate = function (paginate) {
if (!arguments.length) return state.paginate
state.paginate = paginate
Expand All @@ -229,29 +179,20 @@ function Xray () {
node.stream = function () {
state.stream = store.createWriteStream()
var rs = store.createReadStream()
handleStreamError(rs, node)
streamHelper.waitCb(rs, node)
return rs
}

node.write = function (path) {
if (!arguments.length) return node.stream()
state.stream = fs.createWriteStream(path)
handleStreamError(state.stream, node)
streamHelper.waitCb(state.stream, node)
return state.stream
}

return node
}

xray.request = function (url, fn) {
debug('fetching %s', url)
crawler(url, function (err, ctx) {
if (err) return fn(err)
debug('got response for %s with status code: %s', url, ctx.status)
return fn(null, ctx.body)
})
}

methods.forEach(function (method) {
xray[method] = function () {
if (!arguments.length) return crawler[method]()
Expand All @@ -263,13 +204,29 @@ function Xray () {
return xray
}

function Request (crawler) {
return function request (url, fn) {
debug('fetching %s', url)
crawler(url, function (err, ctx) {
if (err) return fn(err)
debug('got response for %s with status code: %s', url, ctx.status)
return fn(null, ctx.body)
})
}
}

function load (html, url) {
var $ = html.html ? html : cheerio.load(html)

This comment has been minimized.

Copy link
@gconnolly

gconnolly Mar 30, 2016

Contributor

@Kikobeats This is where the line is missing.

if (url) $ = absolutes(url, $)
return $
}

/**
* Get the root, if there is one.
*
* @param {Mixed}
* @return {Boolean|String}
*/

function root (selector) {
return (typeof selector === 'string' || isArray(selector)) &&
!~selector.indexOf('@') &&
Expand All @@ -294,53 +251,45 @@ function compact (arr) {
})
}

/**
* Streaming array helper
*
* @param {Stream} data (optional)
*/

function stream_array (stream) {
if (!stream) return function () {}
var first = true

return function _stream_array (data, end) {
var json = JSON.stringify(data, true, 2)

if (first) {
stream.write('[\n')
first = false
}

if (isArray(data)) {
json = json.slice(1, -1)
}

if (end) {
stream.end(json + ']')
} else {
stream.write(json + ',')
}
}
}

/**
* Streaming object helper
*
* @param {Stream} data (optional)
* @return {Function}
*/

function stream_object (stream) {
if (!stream) return function () {}

return function _stream_object (data, end) {
var json = JSON.stringify(data, true, 2)

if (end) {
stream.end(json)
} else {
stream.write(json)
}
function WalkHTML (xray, selector, scope) {
return function _walkHTML ($, fn) {
walk(selector, function (v, k, next) {
if (typeof v === 'string') {
var value = resolve($, root(scope), v)
return next(null, value)
} else if (typeof v === 'function') {
return v($, function (err, obj) {
if (err) return next(err)
return next(null, obj)
})
} else if (isArray(v)) {
if (typeof v[0] === 'string') {
return next(null, resolve($, root(scope), v))
} else if (typeof v[0] === 'object') {
var $scope = $.find ? $.find(scope) : $(scope)
var pending = $scope.length
var out = []

// Handle the empty result set (thanks @jenbennings!)
if (!pending) return next(null, out)

$scope.each(function (i, el) {
var $innerscope = $scope.eq(i)
var node = xray(scope, v[0])
node($innerscope, function (err, obj) {
if (err) return next(err)
out[i] = obj
if (!--pending) {
return next(null, compact(out))
}
})
})
}
}
return next()
}, function (err, obj) {
if (err) return fn(err)
fn(null, obj, $)
})
}
}
58 changes: 58 additions & 0 deletions lib/stream.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
var isArray = Array.isArray

module.exports = {
/**
* Streaming array helper
*
* @param {Stream} data (optional)
*/
array: function stream_array (stream) {
if (!stream) return function () {}
var first = true

return function _stream_array (data, end) {
var json = JSON.stringify(data, true, 2)

if (first) {
stream.write('[\n')
first = false
}

if (isArray(data)) {
json = json.slice(1, -1)
}

if (end) {
stream.end(json + ']')
} else {
stream.write(json + ',')
}
}
},

/**
* Streaming object helper
*
* @param {Stream} data (optional)
* @return {Function}
*/
object: function stream_object (stream) {
if (!stream) return function () {}

return function _stream_object (data, end) {
var json = JSON.stringify(data, true, 2)

if (end) {
stream.end(json)
} else {
stream.write(json)
}
}
},

waitCb: function stream_callback (stream, fn) {
fn(function (err) {
if (err) stream.emit('error', err)
})
}
}

0 comments on commit 6f73cac

Please sign in to comment.