2009-10-09 20:55:29 +02:00
|
|
|
// import.cpp
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Copyright (C) 2008 10gen Inc.
|
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU Affero General Public License, version 3,
|
|
|
|
* as published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU Affero General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "stdafx.h"
|
|
|
|
#include "client/dbclient.h"
|
|
|
|
#include "db/json.h"
|
|
|
|
|
|
|
|
#include "tool.h"
|
|
|
|
|
|
|
|
#include <fstream>
|
|
|
|
#include <iostream>
|
|
|
|
|
|
|
|
#include <boost/program_options.hpp>
|
|
|
|
|
|
|
|
using namespace mongo;
|
|
|
|
|
|
|
|
namespace po = boost::program_options;
|
|
|
|
|
|
|
|
class Import : public Tool {
|
|
|
|
|
|
|
|
enum Type { JSON , CSV , TSV };
|
|
|
|
Type _type;
|
|
|
|
|
|
|
|
const char * _sep;
|
2009-10-12 21:24:38 +02:00
|
|
|
bool _ignoreBlanks;
|
2009-11-28 23:29:23 +01:00
|
|
|
bool _headerLine;
|
|
|
|
|
2009-10-09 20:55:29 +02:00
|
|
|
void _append( BSONObjBuilder& b , const string& fieldName , const string& data ){
|
2009-11-14 03:07:15 +01:00
|
|
|
if ( b.appendAsNumber( fieldName , data ) )
|
2009-10-09 20:55:29 +02:00
|
|
|
return;
|
|
|
|
|
2009-10-12 21:24:38 +02:00
|
|
|
if ( _ignoreBlanks && data.size() == 0 )
|
|
|
|
return;
|
|
|
|
|
2009-10-09 20:55:29 +02:00
|
|
|
// TODO: other types?
|
|
|
|
b.append( fieldName.c_str() , data );
|
|
|
|
}
|
|
|
|
|
2009-11-02 15:52:58 +01:00
|
|
|
BSONObj parseLine( char * line ){
|
|
|
|
if ( _type == JSON ){
|
|
|
|
char * end = ( line + strlen( line ) ) - 1;
|
|
|
|
while ( isspace(*end) ){
|
|
|
|
*end = 0;
|
|
|
|
end--;
|
|
|
|
}
|
2009-10-09 20:55:29 +02:00
|
|
|
return fromjson( line );
|
2009-11-02 15:52:58 +01:00
|
|
|
}
|
2009-10-09 20:55:29 +02:00
|
|
|
|
|
|
|
BSONObjBuilder b;
|
|
|
|
|
|
|
|
unsigned int pos=0;
|
|
|
|
while ( line[0] ){
|
|
|
|
string name;
|
|
|
|
if ( pos < _fields.size() ){
|
|
|
|
name = _fields[pos];
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
stringstream ss;
|
|
|
|
ss << "field" << pos;
|
|
|
|
name = ss.str();
|
|
|
|
}
|
|
|
|
pos++;
|
2009-11-28 17:19:51 +01:00
|
|
|
|
2010-03-22 18:52:21 +01:00
|
|
|
bool done = false;
|
|
|
|
string data;
|
2009-11-28 17:19:51 +01:00
|
|
|
char * end;
|
|
|
|
if ( _type == CSV && line[0] == '"' ){
|
2010-03-22 18:52:21 +01:00
|
|
|
line++; //skip first '"'
|
|
|
|
|
|
|
|
while (true) {
|
|
|
|
end = strchr( line , '"' );
|
|
|
|
if (!end){
|
|
|
|
data += line;
|
|
|
|
done = true;
|
|
|
|
break;
|
|
|
|
} else if (end[1] == '"') {
|
|
|
|
// two '"'s get appended as one
|
|
|
|
data.append(line, end-line+1); //include '"'
|
|
|
|
line = end+2; //skip both '"'s
|
|
|
|
} else if (end[-1] == '\\') {
|
|
|
|
// "\\\"" gets appended as '"'
|
|
|
|
data.append(line, end-line-1); //exclude '\\'
|
|
|
|
data.append("\"");
|
|
|
|
line = end+1; //skip the '"'
|
|
|
|
} else {
|
|
|
|
data.append(line, end-line);
|
|
|
|
line = end+2; //skip '"' and ','
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
2009-11-28 17:19:51 +01:00
|
|
|
end = strstr( line , _sep );
|
2010-03-22 18:52:21 +01:00
|
|
|
if ( ! end ){
|
|
|
|
done = true;
|
|
|
|
data = string( line );
|
|
|
|
} else {
|
|
|
|
data = string( line , end - line );
|
|
|
|
line = end+1;
|
|
|
|
}
|
2009-11-28 17:19:51 +01:00
|
|
|
}
|
2009-11-28 23:29:23 +01:00
|
|
|
|
2009-12-22 20:16:48 +01:00
|
|
|
if ( _headerLine ){
|
|
|
|
while ( isspace( data[0] ) )
|
|
|
|
data = data.substr( 1 );
|
2009-11-28 23:29:23 +01:00
|
|
|
_fields.push_back( data );
|
2009-12-22 20:16:48 +01:00
|
|
|
}
|
2009-11-28 23:29:23 +01:00
|
|
|
else
|
|
|
|
_append( b , name , data );
|
2009-10-09 20:55:29 +02:00
|
|
|
|
2009-11-28 23:29:23 +01:00
|
|
|
if ( done )
|
|
|
|
break;
|
2009-10-09 20:55:29 +02:00
|
|
|
}
|
|
|
|
return b.obj();
|
|
|
|
}
|
|
|
|
|
|
|
|
public:
|
|
|
|
Import() : Tool( "import" ){
|
2009-10-12 21:05:42 +02:00
|
|
|
addFieldOptions();
|
2009-10-09 20:55:29 +02:00
|
|
|
add_options()
|
2009-10-12 21:24:38 +02:00
|
|
|
("ignoreBlanks","if given, empty fields in csv and tsv will be ignored")
|
2009-10-09 20:55:29 +02:00
|
|
|
("type",po::value<string>() , "type of file to import. default: json (json,csv,tsv)")
|
|
|
|
("file",po::value<string>() , "file to import from; if not specified stdin is used" )
|
|
|
|
("drop", "drop collection first " )
|
2009-11-28 23:29:23 +01:00
|
|
|
("headerline","CSV,TSV only - use first line as headers")
|
2009-10-09 20:55:29 +02:00
|
|
|
;
|
|
|
|
addPositionArg( "file" , 1 );
|
|
|
|
_type = JSON;
|
2009-10-12 21:24:38 +02:00
|
|
|
_ignoreBlanks = false;
|
2009-11-28 23:43:33 +01:00
|
|
|
_headerLine = false;
|
2009-10-09 20:55:29 +02:00
|
|
|
}
|
2009-10-12 21:24:38 +02:00
|
|
|
|
2009-10-09 20:55:29 +02:00
|
|
|
int run(){
|
|
|
|
string filename = getParam( "file" );
|
|
|
|
long long fileSize = -1;
|
|
|
|
|
|
|
|
istream * in = &cin;
|
|
|
|
|
2010-02-04 03:07:44 +01:00
|
|
|
ifstream file( filename.c_str() , ios_base::in);
|
2009-10-09 20:55:29 +02:00
|
|
|
|
|
|
|
if ( filename.size() > 0 && filename != "-" ){
|
|
|
|
if ( ! exists( filename ) ){
|
|
|
|
cerr << "file doesn't exist: " << filename << endl;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
in = &file;
|
|
|
|
fileSize = file_size( filename );
|
|
|
|
}
|
|
|
|
|
|
|
|
string ns;
|
|
|
|
|
|
|
|
try {
|
|
|
|
ns = getNS();
|
|
|
|
} catch (...) {
|
|
|
|
printHelp(cerr);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2009-10-12 02:24:12 +02:00
|
|
|
log(1) << "ns: " << ns << endl;
|
|
|
|
|
2009-10-09 20:55:29 +02:00
|
|
|
auth();
|
|
|
|
|
|
|
|
if ( hasParam( "drop" ) ){
|
|
|
|
cout << "dropping: " << ns << endl;
|
|
|
|
conn().dropCollection( ns.c_str() );
|
|
|
|
}
|
|
|
|
|
2009-10-12 21:24:38 +02:00
|
|
|
if ( hasParam( "ignoreBlanks" ) ){
|
|
|
|
_ignoreBlanks = true;
|
|
|
|
}
|
|
|
|
|
2009-10-09 20:55:29 +02:00
|
|
|
if ( hasParam( "type" ) ){
|
|
|
|
string type = getParam( "type" );
|
|
|
|
if ( type == "json" )
|
|
|
|
_type = JSON;
|
|
|
|
else if ( type == "csv" ){
|
|
|
|
_type = CSV;
|
|
|
|
_sep = ",";
|
|
|
|
}
|
|
|
|
else if ( type == "tsv" ){
|
|
|
|
_type = TSV;
|
|
|
|
_sep = "\t";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
cerr << "don't know what type [" << type << "] is" << endl;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( _type == CSV || _type == TSV ){
|
2009-11-28 23:29:23 +01:00
|
|
|
_headerLine = hasParam( "headerline" );
|
|
|
|
if ( ! _headerLine )
|
|
|
|
needFields();
|
2009-10-09 20:55:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
int errors = 0;
|
|
|
|
|
|
|
|
int num = 0;
|
|
|
|
|
|
|
|
time_t start = time(0);
|
|
|
|
|
2009-10-12 02:24:12 +02:00
|
|
|
log(1) << "filesize: " << fileSize << endl;
|
2009-10-09 20:55:29 +02:00
|
|
|
ProgressMeter pm( fileSize );
|
|
|
|
const int BUF_SIZE = 1024 * 1024 * 4;
|
2010-02-28 05:05:40 +01:00
|
|
|
boost::scoped_array<char> line(new char[BUF_SIZE+2]);
|
2009-10-09 20:55:29 +02:00
|
|
|
while ( *in ){
|
2009-12-29 00:27:18 +01:00
|
|
|
char * buf = line.get();
|
|
|
|
in->getline( buf , BUF_SIZE );
|
2009-12-28 22:43:43 +01:00
|
|
|
uassert( 10263 , "unknown error reading file" , ( in->rdstate() & ios_base::badbit ) == 0 );
|
2009-12-29 00:27:18 +01:00
|
|
|
log(1) << "got line:" << buf << endl;
|
2009-10-12 02:24:12 +02:00
|
|
|
|
2009-10-09 20:55:29 +02:00
|
|
|
while( isspace( buf[0] ) ) buf++;
|
2009-10-12 02:24:12 +02:00
|
|
|
|
2009-10-09 20:55:29 +02:00
|
|
|
int len = strlen( buf );
|
|
|
|
if ( ! len )
|
|
|
|
continue;
|
|
|
|
|
2010-02-28 05:05:40 +01:00
|
|
|
buf[len+1] = 0;
|
|
|
|
|
2009-10-09 20:55:29 +02:00
|
|
|
if ( in->rdstate() == ios_base::eofbit )
|
|
|
|
break;
|
|
|
|
assert( in->rdstate() == 0 );
|
|
|
|
|
|
|
|
try {
|
|
|
|
BSONObj o = parseLine( buf );
|
2009-11-28 23:29:23 +01:00
|
|
|
if ( _headerLine )
|
|
|
|
_headerLine = false;
|
|
|
|
else
|
|
|
|
conn().insert( ns.c_str() , o );
|
2009-10-09 20:55:29 +02:00
|
|
|
}
|
|
|
|
catch ( std::exception& e ){
|
|
|
|
cout << "exception:" << e.what() << endl;
|
|
|
|
cout << buf << endl;
|
|
|
|
errors++;
|
|
|
|
}
|
|
|
|
|
|
|
|
num++;
|
|
|
|
if ( pm.hit( len + 1 ) ){
|
|
|
|
cout << "\t\t\t" << num << "\t" << ( num / ( time(0) - start ) ) << "/second" << endl;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
cout << "imported " << num << " objects" << endl;
|
2010-02-09 23:50:38 +01:00
|
|
|
|
2010-02-10 00:15:52 +01:00
|
|
|
conn().getLastError();
|
2009-10-09 20:55:29 +02:00
|
|
|
|
|
|
|
if ( errors == 0 )
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
cerr << "encountered " << errors << " error" << ( errors == 1 ? "" : "s" ) << endl;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
int main( int argc , char ** argv ) {
|
|
|
|
Import import;
|
|
|
|
return import.main( argc , argv );
|
|
|
|
}
|