nodejs/tools/license2rtf.js

'use strict';

const assert = require('assert');
const Stream = require('stream');
const inherits = require('util').inherits;


/*
 * This filter consumes a stream of characters and emits one string per line.
 */
function LineSplitter() {
  const self = this;
  var buffer = '';

  Stream.call(this);
  this.writable = true;

  this.write = function(data) {
    const lines = (buffer + data).split(/\r\n|\n\r|\n|\r/);
    for (var i = 0; i < lines.length - 1; i++) {
      self.emit('data', lines[i]);
    }
    buffer = lines[lines.length - 1];
    return true;
  };

  this.end = function(data) {
    this.write(data || '');
    if (buffer) {
      self.emit('data', buffer);
    }
    self.emit('end');
  };
}
inherits(LineSplitter, Stream);


/*
 * This filter consumes lines and emits paragraph objects.
 */
function ParagraphParser() {
  const self = this;
  var block_is_license_block = false;
  var block_has_c_style_comment;
  var paragraph_line_indent;
  var paragraph;

  Stream.call(this);
  this.writable = true;

  resetBlock(false);

  this.write = function(data) {
    parseLine(data + '');
    return true;
  };

  this.end = function(data) {
    if (data) {
      parseLine(data + '');
    }
    flushParagraph();
    self.emit('end');
  };

  function resetParagraph() {
    paragraph_line_indent = -1;

    paragraph = {
      li: '',
      in_license_block: block_is_license_block,
      lines: []
    };
  }

  function resetBlock(is_license_block) {
    block_is_license_block = is_license_block;
    block_has_c_style_comment = false;
    resetParagraph();
  }

  function flushParagraph() {
    if (paragraph.lines.length || paragraph.li) {
      self.emit('data', paragraph);
    }
    resetParagraph();
  }

  function parseLine(line) {
    // Strip trailing whitespace
    line = line.replace(/\s*$/, '');

    // Detect block separator
    if (/^\s*(=|"){3,}\s*$/.test(line)) {
      flushParagraph();
      resetBlock(!block_is_license_block);
      return;
    }

    // Strip comments around block
    if (block_is_license_block) {
      if (!block_has_c_style_comment)
        block_has_c_style_comment = /^\s*(\/\*)/.test(line);
      if (block_has_c_style_comment) {
        var prev = line;
        line = line.replace(/^(\s*?)(?:\s?\*\/|\/\*\s|\s\*\s?)/, '$1');
        if (prev === line)
          line = line.replace(/^\s{2}/, '');
        if (/\*\//.test(prev))
          block_has_c_style_comment = false;
      } else {
        // Strip C++ and perl style comments.
        line = line.replace(/^(\s*)(?:\/\/\s?|#\s?)/, '$1');
      }
    }

    // Detect blank line (paragraph separator)
    if (!/\S/.test(line)) {
      flushParagraph();
      return;
    }

    // Detect separator "lines" within a block. These mark a paragraph break
    // and are stripped from the output.
    if (/^\s*[=*-]{5,}\s*$/.test(line)) {
      flushParagraph();
      return;
    }

    // Find out indentation level and the start of a lied or numbered list;
    const result = /^(\s*)(\d+\.|\*|-)?\s*/.exec(line);
    assert.ok(result);
    // The number of characters that will be stripped from the beginning of
    // the line.
    const line_strip_length = result[0].length;
    // The indentation size that will be used to detect indentation jumps.
    // Fudge by 1 space.
    const line_indent = Math.floor(line_strip_length / 2) * 2;
    // The indentation level that will be exported
    const level = Math.floor(result[1].length / 2);
    // The list indicator that precedes the actual content, if any.
    const line_li = result[2];

    // Flush the paragraph when there is a li or an indentation jump
    if (line_li || (line_indent !== paragraph_line_indent &&
                    paragraph_line_indent !== -1)) {
      flushParagraph();
      paragraph.li = line_li;
    }

    // Set the paragraph indent that we use to detect indentation jumps. When
    // we just detected a list indicator, wait
    // for the next line to arrive before setting this.
    if (!line_li && paragraph_line_indent !== -1) {
      paragraph_line_indent = line_indent;
    }

    // Set the output indent level if it has not been set yet.
    if (paragraph.level === undefined)
      paragraph.level = level;

    // Strip leading whitespace and li.
    line = line.slice(line_strip_length);

    if (line)
      paragraph.lines.push(line);
  }
}
inherits(ParagraphParser, Stream);


/*
 * This filter consumes paragraph objects and emits modified paragraph objects.
 * The lines within the paragraph are unwrapped where appropriate. It also
 * replaces multiple consecutive whitespace characters by a single one.
 */
function Unwrapper() {
  const self = this;

  Stream.call(this);
  this.writable = true;

  this.write = function(paragraph) {
    const lines = paragraph.lines;
    const break_after = [];
    var i;

    for (i = 0; i < lines.length - 1; i++) {
      var line = lines[i];

      // When a line is really short, the line was probably kept separate for a
      // reason.
      if (line.length < 50) {
        // If the first word on the next line really didn't fit after the line,
        // it probably was just ordinary wrapping after all.
        var next_first_word_length = lines[i + 1].replace(/\s.*$/, '').length;
        if (line.length + next_first_word_length < 60) {
          break_after[i] = true;
        }
      }
    }

    for (i = 0; i < lines.length - 1;) {
      if (!break_after[i]) {
        lines[i] += ' ' + lines.splice(i + 1, 1)[0];
      } else {
        i++;
      }
    }

    for (i = 0; i < lines.length; i++) {
      // Replace multiple whitespace characters by a single one, and strip
      // trailing whitespace.
      lines[i] = lines[i].replace(/\s+/g, ' ').replace(/\s+$/, '');
    }

    self.emit('data', paragraph);
  };

  this.end = function(data) {
    if (data)
      self.write(data);
    self.emit('end');
  };
}
inherits(Unwrapper, Stream);


/*
 * This filter generates an rtf document from a stream of paragraph objects.
 */
function RtfGenerator() {
  const self = this;
  var did_write_anything = false;

  Stream.call(this);
  this.writable = true;

  this.write = function({ li, level, lines, in_license_block: lic }) {
    if (!did_write_anything) {
      emitHeader();
      did_write_anything = true;
    }

    if (li)
      level++;

    var rtf = '\\pard';
    rtf += '\\sa150\\sl300\\slmult1';
    if (level > 0)
      rtf += '\\li' + (level * 240);
    if (li) {
      rtf += '\\tx' + (level) * 240;
      rtf += '\\fi-240';
    }
    if (lic)
      rtf += '\\ri240';
    if (!lic)
      rtf += '\\b';
    if (li)
      rtf += ' ' + li + '\\tab';
    rtf += ' ';
    rtf += lines.map(rtfEscape).join('\\line ');
    if (!lic)
      rtf += '\\b0';
    rtf += '\\par\n';

    self.emit('data', rtf);
  };

  this.end = function(data) {
    if (data)
      self.write(data);
    if (did_write_anything)
      emitFooter();
    self.emit('end');
  };

  function toHex(number, length) {
    var hex = (~~number).toString(16);
    while (hex.length < length)
      hex = '0' + hex;
    return hex;
  }

  function rtfEscape(string) {
    return string
      .replace(/[\\{}]/g, function(m) {
        return '\\' + m;
      })
      .replace(/\t/g, function() {
        return '\\tab ';
      })
      // eslint-disable-next-line no-control-regex
      .replace(/[\x00-\x1f\x7f-\xff]/g, function(m) {
        return '\\\'' + toHex(m.charCodeAt(0), 2);
      })
      .replace(/\ufeff/g, '')
      .replace(/[\u0100-\uffff]/g, function(m) {
        return '\\u' + toHex(m.charCodeAt(0), 4) + '?';
      });
  }

  function emitHeader() {
    self.emit('data', '{\\rtf1\\ansi\\ansicpg1252\\uc1\\deff0\\deflang1033' +
                      '{\\fonttbl{\\f0\\fswiss\\fcharset0 Tahoma;}}\\fs20\n' +
                      '{\\*\\generator txt2rtf 0.0.1;}\n');
  }

  function emitFooter() {
    self.emit('data', '}');
  }
}
inherits(RtfGenerator, Stream);


const stdin = process.stdin;
const stdout = process.stdout;
const line_splitter = new LineSplitter();
const paragraph_parser = new ParagraphParser();
const unwrapper = new Unwrapper();
const rtf_generator = new RtfGenerator();

stdin.setEncoding('utf-8');
stdin.resume();

stdin.pipe(line_splitter);
line_splitter.pipe(paragraph_parser);
paragraph_parser.pipe(unwrapper);
unwrapper.pipe(rtf_generator);
rtf_generator.pipe(stdout);