JSDoc: Source: TextParser.js

/**
 * contains the loaded text from a file or a Wikipedia article
 */
var globalText;

/**
 * contains the words of the globalText
 */
var globalTextArray;

/**
 * contains a set of frequent English words which don't carry much information in a phrase-net
 */
var stopwords = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also",
				 "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway",
				 "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
				 "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant",
				 "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either",
				 "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few",
				 "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full",
				 "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon",
				 "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it",
				 "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill",
				 "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless",
				 "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only",
				 "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put",
				 "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere",
				 "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten",
				 "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon",
				 "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together",
				 "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well",
				 "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
				 "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet",
				 "you", "your", "yours", "yourself", "yourselves", "the"];



/**
 * generates a graph from the globalTextArray and the specified phrase from the main interface
 * 
 * @param {string} reg_all A string containing the regular expression used to build the phrase net
 * 
 * @returns A graph object containing nodes and links
 */
function parseText(reg_all)
{
	//////////////////////////////////////////////////////////////////
	console.log("-----------------------" + "\n" +
				"| PERFORMANCE LOGGING |" + "\n" +
				"-----------------------" + "\n" +
				"t := length of text" + "\n" +
				"n := number of displayed nodes/links" + "\n" +
				"-----------------------");
	var time_prev;
	var time_curr;
	time_prev = new Date().getTime();
	//////////////////////////////////////////////////////////////////
	
	
	/////////////// EXTRACT NODES AND LINKS ///////////////
	
	// use hash maps (simple JavaScript objects) to store the nodes and links obtained from parsing the text
	var simpleNodes = {};
	var simpleLinks = {};
	
	// check for regex and add nodes and links to has maps
	while ((match = reg_all.exec(globalText)) != null) {
		wordA = match[1].toLowerCase();
		wordB = match[2].toLowerCase();
		
		// check for first word/node
		if (simpleNodes[wordA] == null) {
			simpleNodes[wordA] = {name: wordA, inDegree: 0, outDegree: 1, count: 0}; // count will be determined later
		} else {
			simpleNodes[wordA].outDegree++;
		}
		
		// check for second word/node
		if (simpleNodes[wordB] == null) {
			simpleNodes[wordB] = {name: wordB, inDegree: 1, outDegree: 0, count: 0}; // count will be determined later
		} else {
			simpleNodes[wordB].inDegree++;
		}
		
		// check for link between first and second word/node
		// use the concatenated words as the link's id (unique and can be used for fast hash-map look-up)
		var link_id = wordA + "_" + wordB;
		if (simpleLinks[link_id] == null) {
			simpleLinks[link_id] = {source: simpleNodes[wordA], target: simpleNodes[wordB], count: 1};
		} else {
			simpleLinks[link_id].count++;
		}
	}
	
	
	//////////////////////////////////////////////////////////////////
	time_curr = new Date().getTime();
	console.log("EXTRACT NODES AND LINKS:" + "\n" + (time_curr - time_prev) + "ms ~ O(t)");
	time_prev = time_curr;
	//////////////////////////////////////////////////////////////////
	
	
	/////////////// FILTER NODES AND LINKS ///////////////
	
	///// filter nodes (and links) by stop-words /////
	
	// filter nodes by stop words
	for (var key in simpleNodes) {
        // skip loop if the property is from prototype
        if(!simpleNodes.hasOwnProperty(key)) continue;
        
        var node = simpleNodes[key];
        
        // check if node is a stop word
        if (stopwords.indexOf(node.name) != -1)
        	delete simpleNodes[key];
    }
	
	// filter links by stop words
	for (var key in simpleLinks) {
        // skip loop if the property is from prototype
        if(!simpleLinks.hasOwnProperty(key)) continue;
        
        var link = simpleLinks[key];
        
        // check if node is a stop word
        if (stopwords.indexOf(link.source.name) != -1 || stopwords.indexOf(link.target.name) != -1)
        	delete simpleLinks[key];
    }
	
	
	///// filter links by count /////
	
	// get links-array
	var simpleLinksArray = Object.keys(simpleLinks).map(function (key) {return simpleLinks[key];});
	
	// sort links according to count
	simpleLinksArray.sort(function(link1, link2) { return link2.count - link1.count; });
	
	// only select the top links with highest count
	var simpleLinksArrayFiltered = simpleLinksArray.slice(0, document.getElementById("input_maximumConnections").value);
	
	
	///// filter nodes by selected links /////
	
	// create map used for faster accessing of nodes (hash map vs. list) --> will be used for counting nodes in text
	var simpleNodesFilteredMap = {};
	simpleLinksArrayFiltered.forEach(function(link) {
		if (simpleNodesFilteredMap[link.source.name] == null)
			simpleNodesFilteredMap[link.source.name] = link.source;
		
		if (simpleNodesFilteredMap[link.target.name] == null)
			simpleNodesFilteredMap[link.target.name] = link.target;
	});
	
	// also save filtered nodes as array
	simpleNodesArrayFiltered = Object.keys(simpleNodesFilteredMap).map(function (key) {return simpleNodesFilteredMap[key];});
	
	
	//////////////////////////////////////////////////////////////////
	time_curr = new Date().getTime();
	console.log("FILTER NODES AND LINKS:" + "\n" + (time_curr - time_prev) + "ms ~ O(t)");
	time_prev = time_curr;
	//////////////////////////////////////////////////////////////////
	
	
	/////////////// DETERMINE EACH WORD-COUNT ///////////////
	
	for (var i = 0; i < globalTextArray.length; i++) {
		var word = globalTextArray[i];
		
		if (simpleNodesFilteredMap[word] != null)
			simpleNodesFilteredMap[word].count++;
	}
	

	//////////////////////////////////////////////////////////////////
	time_curr = new Date().getTime();
	console.log("DETERMINE EACH WORD-COUNT:" + "\n" + (time_curr - time_prev) + "ms ~ O(t)");
	time_prev = time_curr;
	//////////////////////////////////////////////////////////////////
	
	
	/////////////// MERGE NODES AND LINKS ACCORDING TO TOPOLOGY AND BRING THEM INTO STRUCTURES TO BE VISUALIZED BY THE MAPPER ///////////////

	///// merge nodes /////
	
	// create neighborhood map for filtered nodes
	var neighborhood = {};
	
	simpleLinksArrayFiltered.forEach(function(link) {
	   if (neighborhood[link.source.name] == null) {
	      neighborhood[link.source.name] = {inVertices: [], outVertices: [link.target.name]};
	   } else {
	      neighborhood[link.source.name].outVertices.push(link.target.name);
	   }

	   if (neighborhood[link.target.name] == null) {
	      neighborhood[link.target.name] = {inVertices: [link.source.name], outVertices: []};
	   } else {
	      neighborhood[link.target.name].inVertices.push(link.source.name);
	   }
	});
	
	
	var nodes_mergeStep1 = [];

	// merge nodes which form a complete sub-graph (representation as a loop)
	for (var i = 0; i < simpleNodesArrayFiltered.length; i++) {
		var node_1 = simpleNodesArrayFiltered[i];
		var node_1_NH = neighborhood[node_1.name];

		// check if node has already been visited and ignore it if so
		if (node_1.visited == true) continue;

		var mergedNode = {id: "node_" + i, subnodes: [{name: node_1.name, inDegree: node_1.inDegree, outDegree: node_1.outDegree, count: node_1.count}]};

		// change all links to point to the merged node instead of the old one
		simpleLinksArrayFiltered.forEach(function(link) {
			if (link.source.name == node_1.name) {
				link.source = mergedNode;
			}
			if (link.target.name == node_1.name) {
				link.target = mergedNode;
			}
		});
		
		for (var j = i+1; j < simpleNodesArrayFiltered.length; j++) {
			var node_2 = simpleNodesArrayFiltered[j];
			var node_2_NH = neighborhood[node_2.name];
			
			var checkForOccurence = function(neighborhood, nodeName) {
				return neighborhood.inVertices.findIndex(function(name) {
					return name == nodeName;
				}) != -1 &&
				neighborhood.outVertices.findIndex(function(name) {
					return name == nodeName;
				}) != -1;
			}

			// check if node_1 and node_2 might be in the same complete sub graph
			if (checkForOccurence(node_1_NH, node_2.name) && checkForOccurence(node_2_NH, node_1.name)) {
				// create copied neighborhood of node_2, where node_1 is removed and node_2 is added
				var node_2_NH_compare = {};
				node_2_NH_compare.inVertices = [node_2.name];
				node_2_NH_compare.outVertices = [node_2.name];
				
				for (var k = 0; k < node_2_NH.inVertices.length; k++) {
					if (node_2_NH.inVertices[k] != node_1.name && node_2_NH_compare.inVertices.indexOf(node_2_NH.inVertices[k]) == -1) {
						node_2_NH_compare.inVertices.push(node_2_NH.inVertices[k]);
					}
				}
				
				for (var k = 0; k < node_2_NH.outVertices.length; k++) {
					if (node_2_NH.outVertices[k] != node_1.name && node_2_NH_compare.outVertices.indexOf(node_2_NH.outVertices[k]) == -1) {
						node_2_NH_compare.outVertices.push(node_2_NH.outVertices[k]);
					}
				}
				
				if (compareNeighborhoods(node_1_NH, node_2_NH_compare) == true) {
					// mark node as visited
					node_2.visited = true;
					
					// merge the nodes
					mergedNode.subnodes.push({name: node_2.name, inDegree: node_2.inDegree, outDegree: node_2.outDegree, count: node_2.count});
					
					// change all links to point to the merged node instead of the old one
					simpleLinksArrayFiltered.forEach(function(link) {
						if (link.source.name == node_2.name) {
							link.source = mergedNode;
						}
						if (link.target.name == node_2.name) {
							link.target = mergedNode;
						}
					});
				}
			}
		}
		
		nodes_mergeStep1.push(mergedNode);
	}
	
	
	// create neighborhood map for new merged nodes
	var neighborhood_new = {};
	
	simpleLinksArrayFiltered.forEach(function(link) {
	   if (neighborhood_new[link.source.id] == null) {
		   neighborhood_new[link.source.id] = {inVertices: [], outVertices: [link.target.id]};
	   } else {
		   neighborhood_new[link.source.id].outVertices.push(link.target.id);
	   }

	   if (neighborhood_new[link.target.id] == null) {
		   neighborhood_new[link.target.id] = {inVertices: [link.source.id], outVertices: []};
	   } else {
		   neighborhood_new[link.target.id].inVertices.push(link.source.id);
	   }
	});
	
	
	var nodes_mergeStep2 = [];

	// merge nodes which have the same neighborhood
	for (var i = 0; i < nodes_mergeStep1.length; i++) {
		var node_1 = nodes_mergeStep1[i];
		
		// check if node has already been visited and ignore it if so
		if (node_1.visited == true) continue;
		
		var mergedNode = {id: "node_" + i, subnodes: node_1.subnodes};
		
		// change all links to point to the merged node instead of the old one
		simpleLinksArrayFiltered.forEach(function(link) {
			if (link.source.id == node_1.id) {
				link.source = mergedNode;
			}
			if (link.target.id == node_1.id) {
				link.target = mergedNode;
			}
		});
		
		for (var j = i+1; j < nodes_mergeStep1.length; j++) {
			var node_2 = nodes_mergeStep1[j];
			
			// check for same neighborhoods (and check if the node has no self loop --> otherwise a complete graph where each node has a self loop would be merged)
			if (compareNeighborhoods(neighborhood_new[node_1.id], neighborhood_new[node_2.id]) == true && neighborhood_new[node_1.id].inVertices.indexOf(node_1.id) == -1 ) {
				// mark node as visited
				node_2.visited = true;
				
				// merge the nodes
				mergedNode.subnodes = mergedNode.subnodes.concat(node_2.subnodes);
				
				// change all links to point to the merged node instead of the old one
				simpleLinksArrayFiltered.forEach(function(link) {
					if (link.source.id == node_2.id) {
						link.source = mergedNode;
					}
					if (link.target.id == node_2.id) {
						link.target = mergedNode;
					}
				});
			}
		}
		
		nodes_mergeStep2.push(mergedNode);
	}
	
	
	///// merge links /////
	
	// remove all links which don't point to a merged node (shouldn't exist!!!)
	// iterate backwards to avoid problems when removing links
	for (var i = simpleLinksArrayFiltered.length - 1; i >= 0; i--) {
		// check if there exists no id-value (only present in merged nodes)
		if (simpleLinksArrayFiltered[i].source.id == null || simpleLinksArrayFiltered[i].target.id == null) {
			console.log("The following link doesn't point to a merged node!");
			console.log(simpleLinksArrayFiltered[i]);
			
			// remove link
			simpleLinksArrayFiltered.splice(i, 1);
		}
	}
	
	
	var links = [];
	
	// merge all links with same source and target
	for (var i = 0; i < simpleLinksArrayFiltered.length; i++) {
		var link_1 = simpleLinksArrayFiltered[i];
		
		// check if link has already been visited and ignore it if so
		if (link_1.visited == true) continue;
		
		var mergedLink = {source: link_1.source, target: link_1.target, count: link_1.count};
		var linkCounter = 1;
		
		for (var j = i+1; j < simpleLinksArrayFiltered.length; j++) {
			var link_2 = simpleLinksArrayFiltered[j];

			// check for same source and target
			if (link_1.source.id == link_2.source.id && link_1.target.id == link_2.target.id) {
				// mark link as visited
				link_2.visited = true;
				
				// increase linkCounter
				linkCounter++;
				
				// update count of merged link
				mergedLink.count += link_2.count
			}
		}
		
		// calculate the average count of the merged links
		mergedLink.count /= linkCounter;
		
		links.push(mergedLink);
	}
	

	//////////////////////////////////////////////////////////////////
	time_curr = new Date().getTime();
	console.log("MERGE NODES AND LINKS ACCORDING TO TOPOLOGY:" + "\n" + (time_curr - time_prev) + "ms ~ O(n^2)");
	time_prev = time_curr;
	//////////////////////////////////////////////////////////////////
	
	
	/////////////// CREATE GRAPH CONTAINER ///////////////
	
	var graph = {"nodes" : nodes_mergeStep2, "links" : links};
	return graph;
}



/**
 * compares two neighborhoods and returns true if they contain the same in-node and out-nodes
 * 
 * @param {object} n1 The first neighborhood
 * @param {object} n2 The second neighborhood
 * 
 * @returns true if n1 and n2 contain the same in-node and out-nodes
 */
function compareNeighborhoods(n1, n2) {
	return (compareArrays(n1.inVertices, n2.inVertices) && compareArrays(n1.outVertices, n2.outVertices));
}



/**
 * compares two arrays of strings and returns true if they contain the same elements
 * 
 * @param {array} a The first array
 * @param {array} b The second array
 * 
 * @returns true if a and b contain the same elements
 */
function compareArrays(a, b) {
	if (a.sort().join(',') == b.sort().join(',')) {
	    return true;
	}
	
	return false;
}