Files
ICEcoder/lib/indexer.php
Matt Pass 6ab5aa672d Far more intelligent indexing
Don't index *.min.* files, detect declaration lines with more intelligence by considering the format of the language and varying syntax, pick out the function name and args much better, don't store blank names, take only first word and so exclude things like classes extending/implementing format and no args on classes of course, plus consider if prev data before deciding upon whether to index
2019-10-05 19:13:20 +01:00

213 lines
11 KiB
PHP

<?php
include("headers.php");
include("settings.php");
// File extensions to look for functions & classes in
$indexableFileExts = ["php", "js", "coffee", "ts", "rb", "py", "sql", "erl", "java", "jl", "c", "cpp", "ino", "cs", "go", "lua", "pl"];

// Fallback for prevIndexData to start off initially. It stays an empty array unless a usable
// index can be restored below, so later array operations on it are always safe
$prevIndexData = [];

// The previous index is stored as a serialized array inside a PHP comment block in data/index.php
$indexFilePath = $docRoot.$ICEcoderDir."/data/index.php";
if (file_exists($indexFilePath)) {
    $storedIndex = file_get_contents($indexFilePath);
    if ($storedIndex !== false && strpos($storedIndex, "<?php") !== false) {
        // Strip the comment block wrapper to leave just the serialized array
        $storedIndex = str_replace(["<?php\n/*\n\n", "\n\n*/\n?>"], "", $storedIndex);
        // The index only holds arrays and scalars, so refuse to instantiate any objects
        $restoredIndex = unserialize($storedIndex, ['allowed_classes' => false]);
        // unserialize() returns false on corrupt data; only accept a real array
        if (is_array($restoredIndex)) {
            $prevIndexData = $restoredIndex;
        }
    }
}

// Roughly 1 in 100 index runs, we'll do a full index by discarding the previous data
if (mt_rand(1,100) === 50) {
    $prevIndexData = [];
}

// Start a new indexData for this run
$indexData = [];
/**
 * Recursively scan a directory for function and class declarations.
 *
 * Results are not returned; they are written into the global $indexData array under the
 * "files", "functions" and "classes" keys. Files whose size and mtime match the entry in
 * the global $prevIndexData are skipped, as are files under the ICEcoder dir, files with a
 * non-indexable extension, banned files and *.min.* files.
 *
 * @param string $path Directory to scan
 * @param string $base Directory the scan started from (not used here, passed through on recursion)
 * @return string Always an empty string
 */
function phpGrep($path, $base) {
    global $indexableFileExts, $prevIndexData, $indexData;
    global $ICEcoder, $docRoot, $ICEcoderDir;

    $ret = "";

    // Bail out quietly if the dir can't be opened rather than passing false to readdir()
    $fp = opendir($path);
    if ($fp === false) {
        return $ret;
    }

    // Use a backslash separator only if the path we were given already uses backslashes.
    // NOTE(review): the original loosely compared $serverType against this boolean, which gives
    // the same result for any non-empty server type string - confirm $serverType is always set
    $slash = strpos($path, "\\") !== false ? "\\" : "/";

    // Pattern to strip the doc root from a path, built once as it's the same for every entry
    $rootPrefix = '/'.str_replace("/","\/",preg_quote(str_replace("\\","/",$docRoot))).'/';
    $bannedFiles = $ICEcoder['bannedFiles'] ?? [];

    // Compare against false explicitly so an entry named "0" doesn't end the loop early
    while (($f = readdir($fp)) !== false) {
        // Ignore . and .. paths
        if ($f === "." || $f === "..") {
            continue;
        }
        $filePath = $path.$slash.$f;

        // Exclude the folder ICEcoder is running from
        $localPath = preg_replace($rootPrefix, '', $filePath, 1);
        if (strpos($localPath, $ICEcoderDir) === 0) {
            continue;
        }

        // Recurse into sub-dirs
        if (is_dir($filePath)) {
            $ret .= phpGrep($filePath, $base);
            continue;
        }

        // Only consider files that may contain functions & classes
        $filePathExt = pathinfo($filePath, PATHINFO_EXTENSION);
        if (!in_array($filePathExt, $indexableFileExts, true)) {
            continue;
        }

        // Read size and mtime once per file
        $fileStat = stat($filePath);
        if ($fileStat === false) {
            continue;
        }

        // Same size and mtime as last time, so assume it's not changed and rely on prevIndexData
        if (isset($prevIndexData['files'][$filePath]) &&
            $prevIndexData['files'][$filePath]['size'] === $fileStat['size'] &&
            $prevIndexData['files'][$filePath]['mtime'] === $fileStat['mtime']
        ) {
            continue;
        }

        // Start file data block if we don't have one yet. This is done before the banned and
        // minified checks, so those files are also caught by the unchanged check on the next run
        if (!isset($indexData['files'][$filePath])) {
            $indexData['files'][$filePath] = [
                "size" => $fileStat['size'],
                "mtime" => $fileStat['mtime']
            ];
        }

        // Exclude *.min.* minified files
        if (pathinfo(pathinfo($f, PATHINFO_FILENAME), PATHINFO_EXTENSION) === "min") {
            continue;
        }

        // Exclude banned files (any * in the banned entry is dropped, then it's a substring match)
        $bFile = false;
        foreach ($bannedFiles as $bannedFile) {
            if ($bannedFile !== "" && strpos($f, str_replace("*", "", $bannedFile)) !== false) {
                $bFile = true;
                break;
            }
        }
        if ($bFile) {
            continue;
        }

        $lines = file($filePath);
        if ($lines === false) {
            continue;
        }

        foreach ($lines as $lineNum => $line) {
            // Function data
            $functionText = phpGrepFunctionFromLine($line, $filePathExt);
            if (!empty($functionText)) {
                // Start language block if we don't have one yet
                if (!isset($indexData['functions'][$filePathExt])) {
                    $indexData['functions'][$filePathExt] = [];
                }
                // Set all the data for this function
                $nameStart = strpos($line, $functionText[0]);
                $indexData['functions'][$filePathExt][$functionText[0]] = [
                    "name" => $functionText[0],
                    "range" => [
                        "from" => [
                            "line" => $lineNum,
                            "ch" => $nameStart
                        ],
                        "to" => [
                            "line" => $lineNum,
                            "ch" => ($nameStart + strlen($functionText[0]))
                        ]
                    ],
                    "filePath" => $filePath,
                    "filePathExt" => $filePathExt,
                    "params" => $functionText[1]
                ];
            }

            // Class data
            $className = phpGrepClassFromLine($line);
            if ($className !== "") {
                // Start language block if we don't have one yet
                if (!isset($indexData['classes'][$filePathExt])) {
                    $indexData['classes'][$filePathExt] = [];
                }
                // Set all the data for this class
                $nameStart = strpos($line, $className);
                $indexData['classes'][$filePathExt][$className] = [
                    "name" => $className,
                    "range" => [
                        "from" => [
                            "line" => $lineNum,
                            "ch" => $nameStart
                        ],
                        "to" => [
                            "line" => $lineNum,
                            "ch" => ($nameStart + strlen($className))
                        ]
                    ],
                    "filePath" => $filePath,
                    "filePathExt" => $filePathExt
                ];
            }
        }
    }

    closedir($fp);
    return $ret;
}

/**
 * Pick a function name and its args out of a single source line, if it looks like a declaration.
 *
 * A line qualifies only if it has "(" followed later by ")" AND either matches the format for
 * the file's language or has the word "function" before the opening parens. The parens check
 * applies to every format, so lines without parens can never reach the name/args extraction.
 *
 * @param string $line        One line of source, as returned by file()
 * @param string $filePathExt File extension, used to pick the language format
 * @return array [name, params] (params like "(a, b)"), or an empty array if nothing was found
 */
function phpGrepFunctionFromLine($line, $filePathExt) {
    // We need both parens, in ( then ) order, on the line
    $openPos = strpos($line, "(");
    $closePos = strpos($line, ")");
    if ($openPos === false || $closePos === false || $openPos > $closePos) {
        return [];
    }

    // True if the keyword is on the line and appears before the opening parens
    $comesBeforeParen = static function ($keyword) use ($line, $openPos) {
        $keywordPos = strpos($line, $keyword);
        return $keywordPos !== false && $keywordPos < $openPos;
    };

    // Check for a valid format for the particular language
    switch ($filePathExt) {
        case "py":
        case "rb":
            $languageMatch = $comesBeforeParen("def");
            break;
        case "js":
        case "ts":
            $languageMatch = strpos($line, "=>") !== false;
            break;
        case "erl":
        case "coffee":
            $languageMatch = strpos($line, "->") !== false;
            break;
        case "c":
        case "cpp":
            $bracePos = strpos($line, "{");
            $languageMatch = $bracePos !== false && $bracePos > $openPos;
            break;
        case "go":
            $languageMatch = $comesBeforeParen("func");
            break;
        default:
            $languageMatch = false;
    }

    // ...or, for any language, the word "function" before the opening parens
    if (!$languageMatch && !$comesBeforeParen("function")) {
        return [];
    }

    // Strip away all non alphanum, whitespace, underscore and parens chars, plus the word "function"
    // (no need to remove "def" or "func" as they appear before the function name)
    $functionLine = preg_replace('/[^\da-z\s_\(\)]|\bfunction\b/i', '', $line);
    // Remove whitespace directly before an open parens, then split on open parens to separate
    // the name part from the start of the args
    $functionLine = preg_replace('/\s+\(/', '(', $functionLine);
    $functionLine = explode("(", $functionLine);

    // The name is the last word before the opening parens; don't return blank names
    if (preg_match('/\S+\z/', $functionLine[0], $nameMatch) !== 1) {
        return [];
    }

    // The args are everything up to the first closing parens, comma separated
    $args = explode(")", $functionLine[1])[0];
    return [$nameMatch[0], str_replace(" ", ", ", "(".$args.")")];
}

/**
 * Pick a class name out of a single source line, if it contains "class " followed by a name.
 *
 * Only the identifier itself is returned, so a trailing newline, "{", "(Base):", ":" etc
 * that is attached to the first word after "class " is dropped.
 *
 * @param string $line One line of source, as returned by file()
 * @return string The class name, or an empty string if none was found
 */
function phpGrepClassFromLine($line) {
    $keywordPos = strpos($line, "class ");
    if ($keywordPos === false) {
        return "";
    }
    // Take only the first word after "class ", so extends/implements etc are excluded
    $firstWord = explode(" ", substr($line, $keywordPos + 6))[0];
    // Keep just the leading identifier chars of that word
    if (preg_match('/^[A-Za-z_$\x80-\xff][A-Za-z0-9_$\x80-\xff]*/', $firstWord, $nameMatch) !== 1) {
        return "";
    }
    return $nameMatch[0];
}
// Read the doc root's mtime once; every outcome below reports or compares against it
$docRootMtime = stat($docRoot)['mtime'];

// The browser is up to date only if it passed a timestamp, the prev data has timestamps,
// and the passed timestamp is the same as the one stored in the index
$prevHasTimestamps = isset($prevIndexData["timestamps"]);
$browserUpToDate = isset($_GET['timestamp'])
    && $prevHasTimestamps
    && $_GET['timestamp'] == $prevIndexData["timestamps"]["indexed"];

if ($browserUpToDate) {
    // Same as last time, so just report the timestamps with nothing changed
    $output = [
        "timestamps" => [
            "indexed" => $docRootMtime,
            "browser" => (int) $_GET['timestamp'],
            "changed" => false
        ]
    ];
} elseif ($prevHasTimestamps && $prevIndexData["timestamps"]["indexed"] === $docRootMtime) {
    // The doc root mtime matches the stored index, so output what we already have
    $output = $prevIndexData;
} else {
    // No prev data, or the doc root mtime differs from the stored one, so do an index run
    $indexData["timestamps"] = [
        "indexed" => $docRootMtime,
        "browser" => $_GET['timestamp'] ?? 0,
        "changed" => true
    ];
    // Run the indexer, which fills indexData
    $results = phpGrep($docRoot.$iceRoot, $docRoot.$iceRoot);
    // Overlay indexData on top of prevIndexData
    $output = array_replace_recursive($prevIndexData, $indexData);
    // Store the serialized array in a PHP comment block for next time
    file_put_contents($docRoot.$ICEcoderDir."/data/index.php", "<?php\n/*\n\n".serialize($output)."\n\n*/\n?".">");
}

// Output the JSON
echo json_encode($output, JSON_PRETTY_PRINT);