/* iwdiff.cpp

Ignore whitespace diff
Ignores whitespace, and PROGRESS-style comments.
Only displays line numbers of first non-whitespace difference.

The first input file must be the preprocess output file,
the second input file must be the parser output file.

The preprocessor makes changes to the source if there are any
escape sequences. Our parser output does not. We have to watch
for these differences.


Notes
=====
I use a deque for lookahead, rather than peek(), because peek() combined
with putback() seems to be brittle (especially MinGW G++ 3.1). Grumble.
(And apparently unget() is always brittle, so I didn't even try it.)

The lookahead la(n) might be different than what getNext() returns,
since getNext() deals with comments, extra whitespace chars, etc, but
la(n) returns the exact characters in the lookahead queue.


To Do
=====
- This is unable to deal with "~032" as a character - istream.get() fails.
  "~032" = oct32 = dec26 = hex1A = "SUB" or ^Z.
  I don't know for sure why get() barfs at ^Z - I assume that it is
  because it is an end-of-file character.
  If "~032" is used, say, as a PUT CONTROL character (prodoc\langtut\lt-10-13.p
  calls it "Stop-Compress") then this program is hosed.
  I'm open to suggestions on this one.

*/

/*lint --e{716,731,529}
*/



#include "iwdiff.hpp"
#include <cctype>
#include <deque>
#include <fstream>
#include <sstream>


namespace joanju {
namespace iwdiff {


/* One input stream of the comparison.

Characters are pulled from `in` through a lookahead queue. getNext()
hands back the next significant character: outside of string literals,
comments are dropped and any run of whitespace and/or comments is
reported as one single space.
*/
class SourceFile {
public:

	// String state is shared by both files being compared (they are always
	// inside or outside a string together), hence one class-wide flag
	// instead of a per-object member.
	static bool doingString;
	bool redo;          // when set, the next getNext() returns c again
	bool returnSpace;   // whitespace/comment was skipped by the current getNext()
	char c; // is always equal to la(1)
	int commentLevel;   // current comment nesting depth
	int line;           // 1 + number of newlines consumed so far
	std::deque<char> lookahead;
	std::ifstream in;

	SourceFile()
		: redo(false)
		, returnSpace(false)
		, c('x')
		, commentLevel(0)
		, line(1)
	{
		// A new comparison always starts outside of any string.
		doingString = false;
	}

	// Drop the character at the head of the lookahead queue (if there is
	// one), counting it if it is a newline.
	void consume() {
		if (lookahead.empty())
			return;
		const char dropped = lookahead.front();
		lookahead.pop_front();
		if (dropped == '\n')
			++line;
	}

	// Return the i'th (1-based) character of lookahead, reading from the
	// stream as needed. These are the exact queued characters - no comment
	// or whitespace processing is applied.
	char la(size_t i) {
		for (size_t queued = lookahead.size(); queued < i; ++queued)
			lookahead.push_back(get());
		return lookahead[i - 1];
	}

	// Read one raw character from the stream; 0 once the read fails.
	char get() {
		char ch = '\0';
		return in.get(ch) ? ch : '\0';
	}

	// Advance to, and return, the next significant character (also left
	// in c). Returns 0 at end of input.
	char getNext () {
		if (redo) {
			// Caller asked for the current character to be served again.
			redo = false;
			return c;
		}

		returnSpace = false;
		for (;;) {
			consume();
			c = la(1);

			// End of input, or inside a string where every character counts.
			if (c == 0 || doingString)
				return c;

			if (c == '/' && la(2) == '*') {
				// Comment opener. Step onto the '*'; the next pass drops it.
				consume();
				c = la(1);
				++commentLevel;
				returnSpace = true;
				continue;
			}

			if (c == '*' && commentLevel != 0 && la(2) == '/') {
				// Comment closer. Step onto the '/'; the next pass drops it.
				consume();
				c = la(1);
				--commentLevel;
				continue;
			}

			if (commentLevel != 0)
				continue;   // ordinary character inside a comment

			// The cast avoids debug assertions (MSVC++) on oddball chars.
			if (!isspace(static_cast<unsigned char>(c)))
				break;      // found a significant character
			returnSpace = true;
		}

		if (returnSpace) {
			// Report the skipped whitespace/comments as one space first. It
			// is queued in front of the significant character, which is then
			// delivered by the following call.
			c = ' ';
			lookahead.push_front(c);
		}
		return c;
	}
}; // class SourceFile


} // namespace iwdiff
} // namespace joanju



// Define static class variables
bool joanju::iwdiff::SourceFile::doingString = false;



/* Compare two files, ignoring whitespace and comments outside of strings.

file1 - path of the preprocessor output file.
file2 - path of the parser output file.

Returns an empty string if no significant difference was found. Otherwise
returns a message describing the problem: a file that could not be opened,
one file being shorter/longer than the other, or the line numbers (one per
file) and the characters at the first mismatch.

The order of the special-case checks matters. Each check either
re-synchronizes the two streams and continues with the next character, or
falls through to the next check; the plain mismatch report is the last
resort.
*/
std::string joanju::iwdiff::iwdiff (const char* file1, const char* file2) {

	using joanju::iwdiff::SourceFile;

	// Quote character (' or ") that opened the string currently being read.
	char stringType = ' ';

	// Constructing a SourceFile also resets SourceFile::doingString.
	SourceFile f1;
	SourceFile f2;

	std::string theReturn;

	f1.in.open(file1, std::ios::binary);
	f2.in.open(file2, std::ios::binary);
	if (!f1.in.is_open()) {
		theReturn = "Failed to open file: ";
		theReturn.append(file1);
		return theReturn;
	}
	if (!f2.in.is_open()) {
		theReturn = "Failed to open file: ";
		theReturn.append(file2);
		return theReturn;
	}

	// f1.c starts out non-zero, and becomes 0 when f1 reaches end of input.
	while(f1.c) {

		f1.getNext();
		f2.getNext();


		if (f1.c == f2.c) {

			// Watch for tilde escape sequences
			if (f1.c=='~' && f2.c=='~') {
				// Get the next character of the escape sequence.
				f1.getNext();
				f2.getNext();
				// The casts keep isdigit() well defined for chars with the high
				// bit set (plain char may be signed, and a negative argument
				// other than EOF is undefined behaviour).
				if (	isdigit(static_cast<unsigned char>(f2.c))
					&&	isdigit(static_cast<unsigned char>(f2.la(2)))
					&&	isdigit(static_cast<unsigned char>(f2.la(3)))
					) {
					// Three digits of an octal escape
					f2.getNext();
					f2.getNext();  // now at third digit
				}
				// Inside a string, Progress converts "~n" to "~\r\n".
				// Proparse leaves it as "~n".
				if (SourceFile::doingString && f2.c=='n') {
					if (f1.c=='\r')
						f1.getNext();	// f1 now at '\n', f2 now at 'n'.
				}
				// We don't bother comparing this character in the escape sequence.
				continue;
			}

			// Watch for "." handling within table/field names. See bug#016.
			if (f1.c=='.' && f2.c=='.') {
				f1.getNext();
				f2.getNext();
				if (f1.c==' ' && f2.c!=' ') {
					// Only f1 has whitespace after the dot: skip it in f1,
					// and compare f2's current character again.
					f2.redo = true;
					continue;
				} else {
					// Compare both current characters in the next loop.
					f1.redo = true;
					f2.redo = true;
					continue;
				}
			}

			// Start of new string
			if ( (f1.c=='"' || f1.c=='\'') && !SourceFile::doingString ) {
				SourceFile::doingString = true;
				stringType = f1.c;
				continue;
			}

			// End of string
			if (SourceFile::doingString && f1.c==stringType) {
				if (f2.la(2)==stringType) {
					// The preprocessor removes the escaping quote
					// in sequences like "aaa""aaa" to become "aaa"aaa".
					f2.getNext();
					continue;
				}
				SourceFile::doingString = false;
				continue;
			}

			// They match, and we don't have anything else that we need to
			// do when they match. We can continue on to the next character.
			continue;

		} // f1.c==f2.c


		// Ignore EOF whitespace
		if (f1.c==0 && f2.c==' ')
			f2.getNext();
		if (f1.c==' ' && f2.c==0)
			f1.getNext();

		// Test for different file lengths
		if (f1.c==0 && f2.c!=0) {
			theReturn = file1;
			theReturn += " is shorter than ";
			theReturn += file2;
			return theReturn;
		}
		if (f1.c!=0 && f2.c==0) {
			theReturn = file1;
			theReturn += " is longer than ";
			theReturn += file2;
			return theReturn;
		}

		// Progress's preprocessor turns tabs into correct number of spaces for 8-space columns.
		if (SourceFile::doingString && f1.c==' ' && f2.c=='\t') {
			// We just ignore the rest of the whitespace, rather than try to figure
			// out how to convert the tab to spaces.
			while (f1.c==' ')
				f1.getNext();
			f1.redo = true;  // Compare current character, next loop
			while (f2.c=='\t' || f2.c==' ')
				f2.getNext();
			f2.redo = true;  // Compare current character, next loop
			continue;
		}

		// Progress's preprocessor removes escaped newlines.
		// (Sort of... in a string it converts them to '\r' ??!!)
		// Proparse also removes them, but not inside string literals.
		if (SourceFile::doingString && f2.c=='~' && f1.c!='~') {
			bool doit = false;
			if (f2.la(2)=='\n') {
				f2.getNext();
				doit = true;
			} else if (f2.la(2)=='\r' && f2.la(3)=='\n') {
				f2.getNext();
				f2.getNext();
				doit = true;
			}
			if (doit) {
				if (f1.c=='\r')
					f1.getNext(); // prepro converted escape newline to CR
				f1.redo = true; // compare this character again, next loop
				continue;
			}
		}

		// Progress's preprocessor converts LF to CR LF on Windows.
		// Proparse does not.
		if (f1.c=='\r' && f1.la(2)=='\n' && f2.c=='\n') {
			f2.redo = true;
			continue;
		}

		// Progress's preprocessor will convert "~~\r\n" to "~~\r". Go figure.
		if (SourceFile::doingString && f2.c=='\n' && f1.c!='\n') {
			f1.redo = true;
			continue;
		}

		// Progress's preprocessor leaves in superfluous tildes in identifiers;
		// Proparse removes them.
		if (f1.c=='~' && f2.c!='~' && f1.la(2)==f2.c) {
			f2.redo = true;
			continue;
		}

		// Given a superfluous " ~n." in the code, Progress's preprocessor turns
		// it into " ~\r\n.", and Proparse turns it into " \n.".
		// Since we skip extra whitespace, we end up comparing '~' to '.'.
		if (	SourceFile::doingString==false
			&&	f1.c=='~' && f1.la(2)=='\r' && f1.la(3)=='\n' && f1.la(4)==f2.c
			) {
			f1.getNext(); // now f1.c==' ' (consumed both "\r\n")
			f1.getNext(); // matching char
			// Files are now (possibly) in sync. Continue to next char.
			continue;
		}

		// The preprocessor will discard a tilde in front of an unescaped tilde CR sequence.
		// I don't know why.
		if (f2.c== '~' && f1.c!='~' && f1.c==f2.la(2)) {
			f2.getNext();
			continue;
		}

		// Watch for "." handling within table/field names. See bug#016.
		if (f1.c==' ' && f2.c=='.') {
			f2.redo = true;
			continue;
		}

		// Watch for &string character sequences that Progress's preprocessor
		// might have stripped. See [Note 1] at bottom of this file.
		if (f2.c=='&' && f1.c!='&') {
			// Try to re-synch at next matching char.
			while (f2.c!=f1.c && f2.c!=0)
				f2.getNext();
			// Recheck current charpos, in case f2 is now at EOF.
			f1.redo = true;
			f2.redo = true;
			continue;
		}

		// Progress strips tabs and spaces after &THEN. Proparse does not.
		if (	SourceFile::doingString==false
			&&	(f1.c!=' ' && f1.c!='\t')
			&&	(f2.c==' ' || f2.c=='\t')
			) {
			while (f2.c==' ' || f2.c=='\t') f2.getNext();
			f1.redo = true;
			f2.redo = true;
			continue;
		}

		// Finally, check for mismatched character
		if (f1.c != f2.c) {
			std::ostringstream ost;
			ost << "Difference at line " << f1.line << " " << f1.c << ", " << f2.line << " " << f2.c;
			return ost.str();
		}


	} // while

	// Reached the end of file1 without finding a significant difference.
	return "";

} // iwdiff()



/* Notes

[Note 1]
This is particularly ugly. Given this code:
	def var theInt as int init 1.
	def frame a
	theInt view-as radio-set radio-buttons
	&One , 1 , &Two, 2, &Three,3
	.
	update theInt with frame a.
This is the output:
	def var theInt as int init 1.
	def frame a
	theInt view-as radio-set radio-buttons
	, 1 ,  2, 3
	.
	update theInt with frame a.
I have no idea why Progress's preprocessor would strip the &string
sequences. If we hit an unmatched &, then we consume until we match
another character, and then try to take it from there.

*/
