0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-12-01 09:32:32 +01:00
mongodb/tools/import.cpp
2011-08-01 18:03:39 -04:00

464 lines
14 KiB
C++

// import.cpp
/**
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "pch.h"
#include "client/dbclient.h"
#include "db/json.h"
#include "tool.h"
#include "../util/text.h"
#include <fstream>
#include <iostream>
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
using namespace mongo;
namespace po = boost::program_options;
class Import : public Tool {
enum Type { JSON , CSV , TSV };
Type _type;
const char * _sep;
bool _ignoreBlanks;
bool _headerLine;
bool _upsert;
bool _doimport;
bool _jsonArray;
vector<string> _upsertFields;
static const int BUF_SIZE = 1024 * 1024 * 4;
string trimWhitespace(const string& str) {
if (str.size() == 0) {
return str;
}
size_t begin = 0;
size_t end = str.size() - 1;
while (begin < str.size() && isspace(str[begin])) { ++begin; } // Finds index of first non-whitespace character
while (end > 0 && isspace(str[end])) { --end; } // Finds index of last non-whitespace character
return str.substr(begin, end - begin + 1);
}
void csvTokenizeRow(const string& row, vector<string>& tokens) {
bool inQuotes = false;
bool prevWasQuote = false;
bool tokenQuoted = false;
string curtoken = "";
for (string::const_iterator it = row.begin(); it != row.end(); ++it) {
char element = *it;
if (element == '"') {
if (!inQuotes) {
inQuotes = true;
tokenQuoted = true;
curtoken = "";
} else {
if (prevWasQuote) {
curtoken += "\"";
prevWasQuote = false;
} else {
prevWasQuote = true;
}
}
} else {
if (inQuotes && prevWasQuote) {
inQuotes = false;
prevWasQuote = false;
tokens.push_back(curtoken);
}
if (element == ',' && !inQuotes) {
if (!tokenQuoted) { // If token was quoted, it's already been added
tokens.push_back(trimWhitespace(curtoken));
}
curtoken = "";
tokenQuoted = false;
} else {
curtoken += element;
}
}
}
if (!tokenQuoted || (inQuotes && prevWasQuote)) {
tokens.push_back(trimWhitespace(curtoken));
}
}
void _append( BSONObjBuilder& b , const string& fieldName , const string& data ) {
if ( _ignoreBlanks && data.size() == 0 )
return;
if ( b.appendAsNumber( fieldName , data ) )
return;
// TODO: other types?
b.append ( fieldName , data );
}
/*
* Reads one line from in into buf.
* Returns the number of bytes that should be skipped - the caller should
* increment buf by this amount.
*/
int getLine(istream* in, char* buf) {
if (_jsonArray) {
in->read(buf, BUF_SIZE);
uassert(13295, "JSONArray file too large", (in->rdstate() & ios_base::eofbit));
buf[ in->gcount() ] = '\0';
}
else {
in->getline( buf , BUF_SIZE );
log(1) << "got line:" << buf << endl;
}
uassert( 10263 , "unknown error reading file" ,
(!(in->rdstate() & ios_base::badbit)) &&
(!(in->rdstate() & ios_base::failbit) || (in->rdstate() & ios_base::eofbit)) );
int numBytesSkipped = 0;
if (strncmp("\xEF\xBB\xBF", buf, 3) == 0) { // UTF-8 BOM (notepad is stupid)
buf += 3;
numBytesSkipped += 3;
}
uassert(13289, "Invalid UTF8 character detected", isValidUTF8(buf));
return numBytesSkipped;
}
/*
* Parses a BSON object out of a JSON array.
* Returns number of bytes processed on success and -1 on failure.
*/
int parseJSONArray(char* buf, BSONObj& o) {
int len = 0;
while (buf[0] != '{' && buf[0] != '\0') {
len++;
buf++;
}
if (buf[0] == '\0')
return -1;
int jslen;
o = fromjson(buf, &jslen);
len += jslen;
return len;
}
/*
* Parses one object from the input file. This usually corresponds to one line in the input
* file, unless the file is a CSV and contains a newline within a quoted string entry.
* Returns a true if a BSONObj was successfully created and false if not.
*/
bool parseRow(istream* in, BSONObj& o, int& numBytesRead) {
boost::scoped_array<char> buffer(new char[BUF_SIZE+2]);
char* line = buffer.get();
numBytesRead = getLine(in, line);
line += numBytesRead;
if (line[0] == '\0') {
return false;
}
numBytesRead += strlen( line );
if (_type == JSON) {
// Strip out trailing whitespace
char * end = ( line + strlen( line ) ) - 1;
while ( end >= line && isspace(*end) ) {
*end = 0;
end--;
}
o = fromjson( line );
return true;
}
vector<string> tokens;
if (_type == CSV) {
string row;
bool inside_quotes = false;
size_t last_quote = 0;
while (true) {
string lineStr(line);
// Deal with line breaks in quoted strings
last_quote = lineStr.find_first_of('"');
while (last_quote != string::npos) {
inside_quotes = !inside_quotes;
last_quote = lineStr.find_first_of('"', last_quote+1);
}
row.append(lineStr);
if (inside_quotes) {
row.append("\n");
int num = getLine(in, line);
line += num;
numBytesRead += num;
uassert (15854, "CSV file ends while inside quoted field", line[0] != '\0');
numBytesRead += strlen( line );
} else {
break;
}
}
// now 'row' is string corresponding to one row of the CSV file
// (which may span multiple lines) and represents one BSONObj
csvTokenizeRow(row, tokens);
}
else { // _type == TSV
while (line[0] != '\t' && isspace(line[0])) { // Strip leading whitespace, but not tabs
line++;
}
boost::split(tokens, line, boost::is_any_of(_sep));
}
// Now that the row is tokenized, create a BSONObj out of it.
BSONObjBuilder b;
unsigned int pos=0;
for (vector<string>::iterator it = tokens.begin(); it != tokens.end(); ++it) {
string token = *it;
if ( _headerLine ) {
_fields.push_back(token);
}
else {
string name;
if ( pos < _fields.size() ) {
name = _fields[pos];
}
else {
stringstream ss;
ss << "field" << pos;
name = ss.str();
}
pos++;
_append( b , name , token );
}
}
o = b.obj();
return true;
}
public:
Import() : Tool( "import" ) {
addFieldOptions();
add_options()
("ignoreBlanks","if given, empty fields in csv and tsv will be ignored")
("type",po::value<string>() , "type of file to import. default: json (json,csv,tsv)")
("file",po::value<string>() , "file to import from; if not specified stdin is used" )
("drop", "drop collection first " )
("headerline","CSV,TSV only - use first line as headers")
("upsert", "insert or update objects that already exist" )
("upsertFields", po::value<string>(), "comma-separated fields for the query part of the upsert. You should make sure this is indexed" )
("stopOnError", "stop importing at first error rather than continuing" )
("jsonArray", "load a json array, not one item per line. Currently limited to 4MB." )
;
add_hidden_options()
("noimport", "don't actually import. useful for benchmarking parser" )
;
addPositionArg( "file" , 1 );
_type = JSON;
_ignoreBlanks = false;
_headerLine = false;
_upsert = false;
_doimport = true;
_jsonArray = false;
}
int run() {
string filename = getParam( "file" );
long long fileSize = 0;
istream * in = &cin;
ifstream file( filename.c_str() , ios_base::in);
if ( filename.size() > 0 && filename != "-" ) {
if ( ! exists( filename ) ) {
cerr << "file doesn't exist: " << filename << endl;
return -1;
}
in = &file;
fileSize = file_size( filename );
}
// check if we're actually talking to a machine that can write
if (!isMaster()) {
return -1;
}
string ns;
try {
ns = getNS();
}
catch (...) {
printHelp(cerr);
return -1;
}
log(1) << "ns: " << ns << endl;
auth();
if ( hasParam( "drop" ) ) {
cout << "dropping: " << ns << endl;
conn().dropCollection( ns.c_str() );
}
if ( hasParam( "ignoreBlanks" ) ) {
_ignoreBlanks = true;
}
if ( hasParam( "upsert" ) || hasParam( "upsertFields" )) {
_upsert = true;
string uf = getParam("upsertFields");
if (uf.empty()) {
_upsertFields.push_back("_id");
}
else {
StringSplitter(uf.c_str(), ",").split(_upsertFields);
}
}
if ( hasParam( "noimport" ) ) {
_doimport = false;
}
if ( hasParam( "type" ) ) {
string type = getParam( "type" );
if ( type == "json" )
_type = JSON;
else if ( type == "csv" ) {
_type = CSV;
_sep = ",";
}
else if ( type == "tsv" ) {
_type = TSV;
_sep = "\t";
}
else {
cerr << "don't know what type [" << type << "] is" << endl;
return -1;
}
}
if ( _type == CSV || _type == TSV ) {
_headerLine = hasParam( "headerline" );
if ( ! _headerLine )
needFields();
}
if (_type == JSON && hasParam("jsonArray")) {
_jsonArray = true;
}
time_t start = time(0);
log(1) << "filesize: " << fileSize << endl;
ProgressMeter pm( fileSize );
int num = 0;
int errors = 0;
int len = 0;
// buffer and line are only used when parsing a jsonArray
boost::scoped_array<char> buffer(new char[BUF_SIZE+2]);
char* line = buffer.get();
while ( _jsonArray || in->rdstate() == 0 ) {
try {
BSONObj o;
if (_jsonArray) {
int bytesProcessed = 0;
if (line == buffer.get()) { // Only read on first pass - the whole array must be on one line.
bytesProcessed = getLine(in, line);
line += bytesProcessed;
len += bytesProcessed;
}
if ((bytesProcessed = parseJSONArray(line, o)) < 0) {
len += bytesProcessed;
break;
}
len += bytesProcessed;
line += len;
}
else {
if (!parseRow(in, o, len)) {
continue;
}
}
if ( _headerLine ) {
_headerLine = false;
}
else if (_doimport) {
bool doUpsert = _upsert;
BSONObjBuilder b;
if (_upsert) {
for (vector<string>::const_iterator it=_upsertFields.begin(), end=_upsertFields.end(); it!=end; ++it) {
BSONElement e = o.getFieldDotted(it->c_str());
if (e.eoo()) {
doUpsert = false;
break;
}
b.appendAs(e, *it);
}
}
if (doUpsert) {
conn().update(ns, Query(b.obj()), o, true);
}
else {
conn().insert( ns.c_str() , o );
}
}
num++;
}
catch ( std::exception& e ) {
cout << "exception:" << e.what() << endl;
cout << line << endl;
errors++;
if (hasParam("stopOnError") || _jsonArray)
break;
}
if ( pm.hit( len + 1 ) ) {
cout << "\t\t\t" << num << "\t" << ( num / ( time(0) - start ) ) << "/second" << endl;
}
}
cout << "imported " << num << " objects" << endl;
conn().getLastError();
if ( errors == 0 )
return 0;
cerr << "encountered " << errors << " error" << ( errors == 1 ? "" : "s" ) << endl;
return -1;
}
};
int main( int argc , char ** argv ) {
Import import;
return import.main( argc , argv );
}