I need to accept uploads of CSV files that are up to 350MB, with around 3,000,000 lines. I only need to check the header on the first line. Is there an efficient way to do this? Using node-csv takes a long time because it parses the whole file. I'm using busboy and have a stream.
I've never had to parse a file so large but maybe you can try a combination of the event-stream and get-line packages:
// Demo script: pipe ./test.csv through a get-line stream configured for
// line 1, split that line into trimmed cells, log them, and time the run.
var es, fs, getLine, getLines;

getLine = require('get-line');
fs = require('fs');
es = require('event-stream');

// Line filter configured with lines: [1] (presumably 1-based, i.e. the header
// row — confirm against the get-line README).
getLines = getLine({ lines: [1], encoding: 'utf8' });

/* Split one CSV line on commas and trim whitespace around every cell. */
function splitCells(line) {
  var cells = line.split(',');
  for (var i = 0; i < cells.length; i++) {
    cells[i] = cells[i].trim();
  }
  return cells;
}

/* Called for each line emitted by getLines; do per-line work here. */
function onLine(line, next) {
  var data = splitCells(line);
  console.log(data);
  // Pass the untouched line downstream so es.wait can collect it.
  return next(null, line);
}

/*
 * Called after all lines have been processed; can be removed if no
 * end-of-stream work is needed.
 */
function onDone(err, body) {
  console.timeEnd('get first line');
}

console.time('get first line');

var source = fs.createReadStream('./test.csv', { encoding: 'utf8' });
var selectedLines = source.pipe(getLines);
var processedLines = selectedLines.pipe(es.map(onLine));
processedLines.pipe(es.wait(onDone));
Edit: I just tried my code with a large CSV from here (the star2002 file, 2GB). Here is the result:
[ '1',
'1613423',
'807',
'20011015.2226039991',
'1613424',
'4518',
'0',
'0',
'654',
'1395',
'20011204.1149509996',
'10.955403',
'2288071',
'-0.28820264',
'0.40731233',
'10.559091' ]
get first line: 15ms
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With