I have a user database in MongoDB which I would like to export via a REST interface as JSON. The problem is that in the worst-case scenario the number of returned rows is well over 2 million.
First I tried this
// First attempt: load the whole "users" collection into memory and send it
// as a single JSON string. Shown here as the approach that runs out of memory.
var mongo = require('mongodb'),
  Server = mongo.Server,
  Db = mongo.Db;
var server = new Server('localhost', 27017, {auto_reconnect: true});
var db = new Db('tracking', server);
var http = require('http');
http.createServer(function (request, response) {
  // NOTE: none of the `err` arguments below are checked.
  db.collection('users', function(err, collection) {
    // Empty query object: every document in the collection is selected.
    collection.find({}, function(err, cursor){
      // toArray() hands back the complete result set as one array (`items`),
      // so all matching documents are held in memory at the same time.
      cursor.toArray(function(err, items) {
        // JSON.stringify() then builds one more full copy as a single string.
        // NOTE: `output` is assigned without `var`, so it is an implicit global.
        output = '{"users" : ' + JSON.stringify(items) + '}';
        response.setHeader("Content-Type", "application/json");
        response.end(output);
      });
    });
  });
}).listen(8008);
console.log('Server running at localhost:8008');
which fails when running out of memory. The example uses the node-mongodb-native driver and the basic http package.
FATAL ERROR: CALL_AND_RETRY_2 Allocation failed - process out of memory
(note that in real scenario I use parameters which limit the results as needed, but this example queries them all which is the worst case scenario regardless)
The data itself is simple, like
{ "_id" : ObjectId("4f993d1c5656d3320851aadb"), "userid" : "80ec39f7-37e2-4b13-b442-6bea57472537", "user-agent" : "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322)", "ip" : "127.0.0.1", "lastupdate" : 1335442716 }
I also tried something like
// Second attempt: pull documents one at a time with nextObject().
// NOTE: `cursor` is never reassigned inside the loop, so the condition stays
// true and the loop never exits; every iteration issues another nextObject()
// call. Presumably nextObject() is asynchronous (it takes a callback), in which
// case the callbacks can never run while this synchronous loop keeps spinning
// and the pending requests just pile up — TODO confirm against the driver.
while(cursor != null)
{
  cursor.nextObject(function(err, item) {
    // `err` is not checked; `item` is written without any separator.
    response.write(JSON.stringify(item));
  });
}
but that ran out of memory too.
How should I proceed? There should be a way to stream the data row by row, but I haven't been able to find a suitable example for it. Paging the data is out of the question because of external application requirements. I thought of writing the data to a file and then posting it, but that leads to unwanted io.
The cursor.streamRecords() method of the native MongoDB driver is deprecated;
use the stream() method instead, which is also faster.
I have parsed a 40,000,000-row document of a catalog without problems with MongoDB + stream() + process.nextTick().
I found out that node-mongodb-native Cursor object has a streaming option (used with collection.find().streamRecords()) for the records too even if it's not mentioned in the github page of the driver. See the Cursor source code and search for "streamRecords".
In the end the code ended up like this:
// Streams the "users" collection to the HTTP response as
//   {"users" : [ doc, doc, ... ]}
// writing one document at a time so the full result set is never held in
// memory. `db` and `response` are taken from the enclosing scope.
db.collection('users', function(err, collection) {
  if (err) {
    // Nothing has been sent yet, so a real error status can still be returned.
    response.statusCode = 500;
    response.end();
    return;
  }
  // Tracks whether a separator is needed before the next document.
  var first = true;
  response.setHeader("Content-Type", "application/json");
  response.write('{"users" : [');
  var stream = collection.find().streamRecords();
  stream.on('data', function(item) {
    var prefix = first ? '' : ', ';
    response.write(prefix + JSON.stringify(item));
    first = false;
  });
  stream.on('error', function(streamErr) {
    // Without a listener an 'error' event is thrown by EventEmitter and takes
    // the process down. Headers and part of the body are already sent, so the
    // only honest signal left is to abort the connection: the client then
    // sees a truncated response instead of a seemingly complete one.
    console.error(streamErr);
    response.destroy();
  });
  stream.on('end', function() {
    // Close the array and the wrapping object, then finish the response.
    response.end(']}');
  });
});
Something like that should work. If it doesn't you should probably open an issue in the mongodb-native bug tracker.
// HTTP server that streams every document of the "users" collection as
//   {"users" : [ doc, doc, ... ]}
// using cursor.each(), so documents are serialized and written one by one.
// `db` and `http` are taken from the enclosing scope.
http.createServer(function (request, response) {
  // Reports a failure that happened before any part of the body was written.
  function failBeforeBody(error) {
    console.error(error);
    response.statusCode = 500;
    response.end();
  }

  db.collection('users', function(err, collection) {
    if (err) {
      failBeforeBody(err);
      return;
    }
    collection.find({}, function(err, cursor){
      if (err) {
        failBeforeBody(err);
        return;
      }
      // Tracks whether a separator is needed before the next document.
      var first = true;
      response.setHeader("Content-Type", "application/json");
      // Open the envelope; without it (and the separators below) the
      // concatenated documents would not form a valid JSON text.
      response.write('{"users" : [');
      cursor.each(function(err, item) {
        if (err) {
          // Part of the body is already sent: abort so the client sees a
          // truncated response rather than a well-formed but incomplete one.
          console.error(err);
          response.destroy();
          return;
        }
        if (item) {
          response.write((first ? '' : ', ') + JSON.stringify(item));
          first = false;
        } else {
          // A falsy item marks the end of the cursor: close the envelope.
          response.end(']}');
        }
      });
    });
  });
}).listen(8008);
PS: It's just a stub — I don't remember the exact syntax — but the each() function is what you're looking for.
Well, I no longer use the native MongoDB JavaScript driver, but Mongoose has a pretty good implementation of streams.
The syntax of the two drivers is pretty similar. You can do this with mongoose :
// Streams the query result to the HTTP response as {"users" : [ ... ]},
// one document at a time. `collection` and `response` come from the
// enclosing scope.
response.setHeader("Content-Type", "application/json");
var stream = collection.find().stream();
// Tracks whether a separator is needed before the next document.
var first = true;
// Set once the stream has failed, so the 'close' handler does not try to
// finish a response that has already been aborted.
var failed = false;
response.write('{"users" : [');
stream.on('data', function(doc) {
   // response.write() only accepts a string or Buffer, so each document has
   // to be serialized explicitly before it is written.
   response.write((first ? '' : ', ') + JSON.stringify(doc));
   first = false;
});
stream.on('error', function(streamErr) {
   // Part of the body is already sent: abort so the client sees a truncated
   // response rather than a well-formed but incomplete one.
   failed = true;
   console.error(streamErr);
   response.destroy();
});
stream.on('close', function() {
   if (failed) {
      return;
   }
   response.end(']}');
});
A little module to do that using Node's stream.Transform class:
var stream = require('stream');
/**
 * Creates a Transform stream that turns a sequence of objects into the text
 * of a single JSON array.
 *
 * Each incoming object is serialized with JSON.stringify and pushed
 * immediately, prefixed with '[' for the first element and ', ' for every
 * later one. When the input ends the array is closed with ']'. If no object
 * was received at all, '[]' is emitted so the output is still valid JSON.
 *
 * @returns {stream.Transform} object-mode transform emitting string chunks.
 */
function createCursorStream(){
    var cursorStream = new stream.Transform({objectMode:true});
    // Kept as a property on the stream (as before) so existing code that
    // inspects `started` keeps working; normalized to an explicit boolean.
    cursorStream.started = false;
    cursorStream._transform = function(chunk,encoding,done){
        var prefix = cursorStream.started ? ', ' : '[';
        cursorStream.push(prefix + JSON.stringify(chunk));
        cursorStream.started = true;
        done();
    };
    cursorStream._flush = function(done){
        // No element seen means the opening bracket was never written.
        cursorStream.push(cursorStream.started ? ']' : '[]');
        done();
    };
    return cursorStream;
}
/**
 * Sends the documents of a cursor to a response as JSON text.
 *
 * The cursor's stream is piped through the transform returned by
 * createCursorStream(), and that transform is piped into `response`.
 *
 * @param {Object} cursor - object exposing a stream() method.
 * @param {Object} response - writable destination for the JSON text.
 */
module.exports.streamCursorToResponse = function(cursor,response){
    var documents = cursor.stream();
    var jsonArray = createCursorStream();
    documents.pipe(jsonArray);
    jsonArray.pipe(response);
};
You can alter the JSON.stringify parts to do any other kind of "on the fly" transform on the objects coming from the MongoDB cursor, and save some memory.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With