. * * * User defined settings * ~~~~~~~~~~~~~~~~~~~~~ * If you wish to run AssEd on your own server, you will need to change the three settings below. * You will also need to make sure that PHP is running with the sessions module installed and the HttpRequest extension available. * AssEd can use a lot of memory, storage, and bandwidth -- make sure you keep an eye on these. * */ // How should AssEd identify itself to Entrez when requesting data? $ncbi_Tool = "AssEd"; //What is your email address? $ncbi_Email = "you@yoursite.com"; // folder to store cached files in $ncbi_CachePath = "/path/to/your/htdocs/cache/"; // number of items to fetch from Entrez in each request -- between 1000 and 2500 is good $ncbi_RetMax = 2500; /* * You should not edit anything other than those four lines, unless you really know what you're doing! * * * Changelog * ~~~~~~~~~ * - v0.5b, 2008-09-28: * - Moved to using translated query for cache file names (md5 encoded to solve length problem), thus eliminating duplicated caches for e.g. plurals * - Moved most of the cache handling sections of code out into dedicated functions to make the whole thing easier to follow and update * - Added user variable for retmax, allowing one to fine tune the balance between bandwidth and processing per page vs page loads required (also raised default from 1000 to 2500) * - Minor fix: increase PHP's memory limit for large datasets * - Minor fixes in preparation for new month cache updates * - Minor fix: HTML is now utf-8 for correct handling of non-standard characters in names * - v0.4b, 2008-09-20: * - Added a temporary cache of author list for every 50,000 results retrieved so that the list array can be deleted, thus vastly * improving the time taken to process each results file. 
* - Minor fix: filter results previously caused an error if none of the filters were on * - Minor fix: the delete redundant cache files function was moved to before exit; is called * - Minor fix: added brackets to the search term on the links in results as a fix for searches that contain OR clauses * - Minor fix: escaped quote marks to %32 in search terms on the links in results * - Minor fix: WebEnv fix from v0.3 moved to before file cache to prevent endless reloading * - Minor fix: unprocessed XML caches now expire at the end of each month rather than after 7 days * - v0.3b, 2008-09-09: * - Added columns for filter values to the results page * - Added filters for last authorship and date ranges for when the author began publishing * - Minor fix: run PubMedIds again when the WebEnv session id expires * * ********************************************************************************************************************************************/ error_reporting(E_ALL); ini_set('display_errors', '1'); session_start(); if (isset($_POST['NewSession'])) session_unset(); // Check if this is a new search and, if so, delete old session variables $t = new AssEd(); // Run the object class AssEd { var $Stage; // array - allows us to remember where we are in the process: 0=search/filters, 1=search number, 2=stage within the search var $PostData; // array - everything in the search form -- see this->GetPostData() var $AuthorList; // array - the final list of authors during the filtering stage var $Strings; // array - search terms var $AllResults; // integer - count of searches (between 1 and 5) var $CurResults; // array - the results of an ongoing search var $Connection; // object - HttpRequest (PHP extension) for communicating with Entrez var $Xml; // string - result of an Entrez query in XML format var $Array = array(); // array - result of an Extrez query in array format (use $this->Array = $this->XmlParser()) var $WebEnv; // string - Entrez's "WebEnv" variable, which is 
basically a session key for the pubmed search - see http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html var $QueryKey; // int - Entrez's "QueryKey" variable -- I'm not sure this is actually important, or if I've been using it properly, but it has never been an issue so I have ignored it! var $QueryData; // array - contains the variables to be sent to Entrez in the next query -- see the relevant pages at http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html var $SearchTerm; // string - currently active search term var $MinDate; // string - currently active year limit (Note: this is not actually the minimum date, it is the number of years to go back to. The name is confusing for historical reasons!) var $Translation; // string - currently active query translation var $Title; // string - the title for the output page var $Html; // string - the page specific HTML, for use in the OutputHtml function Var $StickyHtml; // string - HTML which is carried over multiple pages /******************************************************************************************************************************************** * Function: GetPostData() * General switching function and reloader of saved variables between * page loads. 
*/

    /**
     * Constructor (PHP 4 style, named after the class) and per-request dispatcher.
     *
     * NOTE: the banner immediately above is labelled "GetPostData()" but it
     * describes this constructor.
     *
     * Every page load builds one AssEd object. The constructor restores the state
     * saved in the session by the previous load and then runs exactly one step,
     * chosen by $this->Stage[0]:
     *   - "searches": run/continue the current PubMed search, then advance to the
     *                 next search (or to "filters") and reload the page
     *   - "filters":  run the post-search filters and print the results
     *   - "complete": optionally re-sort the saved author list by the column named
     *                 in $_GET['order'], then print the results again
     */
    function AssEd() {
        // Create HTTP connection
        $this->Connection = new HttpRequest;

        // Restore whatever the previous page load stored in the session
        foreach (array('Stage', 'PostData', 'WebEnv', 'QueryKey', 'AuthorList', 'CurResults') as $Name) {
            if (isset($_SESSION[$Name])) $this->$Name = $_SESSION[$Name];
        }
        $this->StickyHtml = isset($_SESSION['StickyHtml']) ? $_SESSION['StickyHtml'] : "";
        $this->Html = "";                                   // Set up an empty string, for the sake of PHP compliance

        // No saved stage means a brand new request: read the form and start the first search
        if (!isset($this->Stage)) {
            $_SESSION['began'] = microtime(true);           // Start time, used to report the time elapsed
            $this->GetPostData();
            $this->Stage = array("searches", 1, 1);
        }

        if ($this->Stage[0] == "searches") {
            // If the search is incomplete this call does not return (the page reloads),
            // so everything below only happens once the current search has finished
            $this->PubmedSearch();

            $n = $this->Stage[1] + 1;
            if (!empty($this->PostData['Searches'][$n])) {
                $this->Stage[1] = $n;                       // Move on to the next search ...
                $this->Stage[2] = 1;                        // ... starting from its first result
            }
            else $this->Stage[0] = "filters";               // No more searches: the filters are next
            $this->SaveReload();
        }
        elseif ($this->Stage[0] == "filters") {
            $this->RunFilters();
            $this->Stage = array("complete", 0, 0);
            $this->PrintResults();
        }
        elseif ($this->Stage[0] == "complete") {
            // Re-ordering. 'order' comes straight from the URL, so it is only used when it
            // is a plain string naming a column that really exists in the author list.
            if (isset($_GET['order']) && is_string($_GET['order']) && !empty($this->AuthorList)) {
                $order = $_GET['order'];

                // Pivot Author => (Column => Count) into Column => (Author => Count)
                $byColumn = array();
                foreach ($this->AuthorList as $Author => $Items) {
                    foreach ($Items as $Term => $Count) $byColumn[$Term][$Author] = $Count;
                }

                if (isset($byColumn[$order])) {
                    arsort($byColumn[$order]);

                    // Authors holding the chosen column go in first, in sorted order ...
                    $sorted = array();
                    foreach ($byColumn[$order] as $Author => $Count) $sorted[$Author][$order] = $Count;
                    // ... then every column is copied back (authors lacking the column end up last)
                    foreach ($byColumn as $Term => $Items) {
                        foreach ($Items as $Author => $Count) $sorted[$Author][$Term] = $Count;
                    }
                    $this->AuthorList = $sorted;
                }
            }
            $this->PrintResults();
        }
    }

    /**
     * Function: GetPostData()
     * Copies the submitted search form from $_POST into $this->PostData.
     *
     * Reads up to eight "SearchString-N" fields (with their AtLeast-N / AtMost-N /
     * Since-N companions), stopping at the first empty one, and stores the number
     * of searches found in $this->Strings. Then copies the publication-count,
     * seniority and ignore-list settings.
     *
     * If a search string contains a [dp] or [edat] tag, an error page is printed
     * via OutputHtml() and the script exits.
     *
     * Returns true.
     */
    function GetPostData() {
        $Found = 0;
        for ($i = 1; $i <= 8; $i++) {
            $tSearchString = "SearchString-" . $i;          // -- the search string itself
            $tAtLeast = "AtLeast-" . $i;                    // -- the minimum number of publications
            $tAtMost = "AtMost-" . $i;                      // -- and the maximum
            $tSince = "Since-" . $i;                        // -- the number of years to search back

            if (empty($_POST[$tSearchString])) break;       // First unused field: no more searches
            $Found = $i;

            // The companion fields are read defensively: display_errors is on, so an
            // undefined-index notice would be printed into the page
            $this->PostData['Searches'][$i] = array(
                "SearchString" => trim($_POST[$tSearchString]),
                "AtLeast" => isset($_POST[$tAtLeast]) ? trim($_POST[$tAtLeast]) : "",
                "AtMost" => isset($_POST[$tAtMost]) ? trim($_POST[$tAtMost]) : "",
                "Since" => isset($_POST[$tSince]) ? trim($_POST[$tSince]) : ""
            );

            // Warn people using dp or edat that they do not need to.
            // strpos() returns 0 for a match at the very start of the string, so the
            // result must be compared against false rather than tested for truthiness.
            $Lower = strtolower($_POST[$tSearchString]);
            if ((strpos($Lower, "[dp]") !== false) OR (strpos($Lower, "[edat]") !== false)) {
                $this->StickyHtml .= "\n\nError: it looks like you are trying to specify a date range in the search string. You do not need to do this! For several reasons, the date range must be specified in the date range field only.\n\n\n";
                $this->OutputHtml(false);
                exit;
            }
        }
        // Set unconditionally, so the count is also recorded when all eight fields were used
        $this->Strings = $Found;

        // Publications in the past year: explicit limits, or the "simple" preset
        if (!empty($_POST['Active-1yr'])) {
            $this->PostData['Active'] = true;
            $this->PostData['AtLeast-1yr'] = isset($_POST['AtLeast-1yr']) ? $_POST['AtLeast-1yr'] : "";
            $this->PostData['AtMost-1yr'] = isset($_POST['AtMost-1yr']) ? $_POST['AtMost-1yr'] : "";
        }
        elseif (!empty($_POST['Active-1yr-simple'])) {
            $this->PostData['Active'] = true;
            $this->PostData['AtLeast-1yr'] = 1;
            $this->PostData['AtMost-1yr'] = 1000;
        }
        else $this->PostData['Active'] = false;

        // Publications in the past five years: explicit limits, or the "simple" preset
        if (!empty($_POST['Active-5yr'])) {
            $this->PostData['Active-5yr'] = true;
            $this->PostData['AtLeast-5yr'] = isset($_POST['AtLeast-5yr']) ? $_POST['AtLeast-5yr'] : "";
            $this->PostData['AtMost-5yr'] = isset($_POST['AtMost-5yr']) ? $_POST['AtMost-5yr'] : "";
        }
        elseif (!empty($_POST['Active-5yr-simple'])) {
            $this->PostData['Active-5yr'] = true;
            $this->PostData['AtLeast-5yr'] = 10;
            $this->PostData['AtMost-5yr'] = 1000;
        }
        else $this->PostData['Active-5yr'] = false;

        // The remaining limits all follow one pattern: a checkbox plus two value fields
        $Limits = array(
            'Active-All' => array('AtLeast', 'AtMost'),
            'Active-Senior' => array('AtLeastSe', 'AtMostSe'),
            'Active-Old' => array('LimOld', 'LimYoung'),
            'Active-Old-Senior' => array('SeLimOld', 'SeLimYoung')
        );
        foreach ($Limits as $Flag => $Fields) {
            $this->PostData[$Flag] = !empty($_POST[$Flag]);
            if (!$this->PostData[$Flag]) continue;
            foreach ($Fields as $Field) {
                $this->PostData[$Field] = isset($_POST[$Field]) ? $_POST[$Field] : "";
            }
        }

        // Set the ignore list
        if (!empty($_POST['Ignore'])) $this->PostData['Ignore'] = $_POST['Ignore'];
        // The 'IgnoreCoAus' checkbox stores the value of the separate 'IgnoreCoAusYear' field
        if (!empty($_POST['IgnoreCoAus']) && isset($_POST['IgnoreCoAusYear'])) {
            $this->PostData['IgnoreCoAus'] = $_POST['IgnoreCoAusYear'];
        }

        return true;
    }
/******************************************************************************************************************************************** * Function: OutputHtml( Reload ) * Formats the page as HTML */ function OutputHtml($Reload = false) { $Title = $this->Title; $Html = $this->StickyHtml . $this->Html; if ($Reload) $Reload = "\n"; else $Reload = ""; $Time = round((microtime(true) - $_SESSION['began']), 2); // Calculate the time elapsed echo << $Title $Reload

AssEdbeta: the semi-automatic assistant editor

$Html

... time elapsed: $Time seconds.

END; exit; } /******************************************************************************************************************************************** * Function: SaveReload() * Saves variables and reloads the page to move on to the next stage */ function SaveReload() { $_SESSION['Stage'] = $this->Stage; $_SESSION['PostData'] = $this->PostData; $_SESSION['QueryKey'] = $this->QueryKey; $_SESSION['StickyHtml'] = $this->StickyHtml; if (isset($this->WebEnv)) $_SESSION['WebEnv'] = $this->WebEnv; if (isset($this->AuthorList)) $_SESSION['AuthorList'] = $this->AuthorList; if (isset($this->AllResults)) $_SESSION['AllResults'] = $this->AllResults; if (isset($this->CurResults)) $_SESSION['CurResults'] = $this->CurResults; if ($this->Stage[0] == "searches") { foreach ($this->PostData['Searches'] as $k => $Search) { if ($this->Stage[1] > $k) $this->Html .= "

\n

" . $Search['SearchString'] . ": 100% (" . $Search['Count'] . " / " . $Search['Count'] . ")

\n"; elseif (($this->Stage[1] == $k) and !empty($Search['Count'])) { $width = ($this->Stage[2] / $Search['Count']) * 600; $pc = round(($width / 6), 2); $width = round($width); $width2 = 600 - $width; $this->Html .= "

\n

" . $Search['SearchString'] . ": " . $pc . "% (" . $this->Stage[2] . " / " . $Search['Count'] . ")

\n"; } else $this->Html .= "

\n

" . $Search['SearchString'] . ": 0%

\n"; } $this->Html .= "

\n

Post-search filters: 0%

\n"; } else { if (empty($_SESSION['Filtered']) and isset($_SESSION['ToFilter'])) $_SESSION['Filtered'] = $_SESSION['ToFilter'] - count($this->AllResults[1]); foreach ($this->PostData['Searches'] as $k => $Search) { $this->Html .= "

\n

" . $Search['SearchString'] . ": 100% (" . $Search['Count'] . " / " . $Search['Count'] . ")

\n"; } if (isset($_SESSION['Filtered']) and isset($_SESSION['ToFilter'])) { $width = ($_SESSION['Filtered'] / $_SESSION['ToFilter']) * 600; $pc = round(($width / 6), 2); $width = round($width); $width2 = 600 - $width; $this->Html .= "

\n

Applying limits and filters: " . $pc . "%

\n"; } else $this->Html .= "

\n

Post-search filters: 0%

\n"; } $this->Title = "AssEd: Processing searches..."; $this->OutputHtml(true); } /******************************************************************************************************************************************** * Function: PrintResults() * Prints the search results */ function PrintResults() { $_SESSION['Stage'] = $this->Stage; // Save these two variables into the session to enable re-ordering if (isset($this->AuthorList)) $_SESSION['AuthorList'] = $this->AuthorList; $this->Title = "AssEd: Search complete"; if (empty($this->AuthorList)) $this->Html .= "

Sorry, it would appear that nobody has that particularly combination of skills.

\n"; else { $this->Html .= "
\n\n"; $this->Html .= "\n"; $this->Html .= "\n"; // Print the column headers $i = 0; foreach ($this->PostData['Searches'] as $Search) { $i += 1; $this->Html .= "\n"; } // Check the limits if ($this->PostData['Active-All']) $this->Html .= "\n"; if ($this->PostData['Active-5yr']) $this->Html .= "\n"; if ($this->PostData['Active']) $this->Html .= "\n"; if ($this->PostData['Active-Senior']) $this->Html .= "\n"; $this->Html .= "\n"; // Loop through the Authors $j = count($this->PostData['Searches']); foreach ($this->AuthorList as $Author => $Items) { $this->Html .= "\n"; // Loop through the search terms for ($i = 1; $i <= $j; $i++) { $this->Html .= "\n"; } // Check the limits if ($this->PostData['Active-All']) $this->Html .= "\n"; if ($this->PostData['Active-5yr']) { $t = array(date('Y'), date('m'), date('d')); $pdat2 = implode("/", $t); $t[0] -= 5; $pdat1 = implode("/", $t); $this->Html .= "\n"; } if ($this->PostData['Active']) { $t = array(date('Y'), date('m'), date('d')); $pdat2 = implode("/", $t); $t[0] -= 1; $pdat1 = implode("/", $t); $this->Html .= "\n"; } if ($this->PostData['Active-Senior']) $this->Html .= "\n"; $this->Html .= "\n"; } $this->Html .= "
Author" . $Search['SearchString'] . "All pubs 5yr 1yr Last Au
" . $Author . "PostData['Searches'][$i]['SearchString']) . ")\">" . $Items[$i] . "" . $Items['All'] . "" . $Items['5yr'] . "" . $Items['1yr'] . "" . $Items['Senior'] . "
\n
\n"; } $this->OutputHtml(false); }

    /*************************************************************************************************************************************************
            NCBI / ENTREZ FUNCTIONS
            ***********************
            1. GetXmlFile
            2. -GetCounts-          -- Removed in stripped down version, PubmedIDs gives counts
            3. PubmedIds
            4. PubmedSummaries
            5. PubmedSearch
            6. PubmedAuthors
            7. FilterSearches
    **************************************************************************************************************************************************/

    /**
     * Function: GetXmlFile ( URL, QueryData )
     * Fetches an Entrez response into $this->Xml, using a file cache.
     *
     * The cache file name is built from the query data (minus WebEnv, which would
     * make the name session-specific). A cached copy from the current calendar
     * month is used as-is; an older one is deleted and fetched again. Fresh
     * responses are written back to the cache.
     *
     * Does not return when the request throws or when Entrez answers
     * "Unable to obtain query #1": in both cases a warning is queued and the
     * page is reloaded through SaveReload().
     *
     * - Url:       the E-utilities endpoint to call
     * - QueryData: array of query parameters for that endpoint
     * Returns true.
     */
    function GetXmlFile($Url, $QueryData) {
        global $ncbi_Tool, $ncbi_Email, $ncbi_CachePath;

        // Start by making a filename for the cache
        $QueryData2 = $QueryData;
        unset($QueryData2['WebEnv']);                       // WebEnv would prevent a cached file being used across sessions
        foreach ($QueryData2 as &$temp) {
            $temp = str_replace("/", "", $temp);            // '/' (used in mindate/maxdate) would be read as a directory separator
            $temp = trim($temp, ".");                       // No leading/trailing dots in file name parts
            $temp = trim($temp);                            // Stray whitespace would defeat the cache lookup
        }
        unset($temp);                                       // Break the reference left behind by the by-reference foreach

        $filename = null;
        if ($this->Stage[0] == "searches") {
            $filename = $ncbi_CachePath . "xml_files/" . trim($this->MinDate, ".") . "," . trim($this->SearchTerm, ".") . "," . implode(",", $QueryData2) . ".txt";
            $_SESSION['Cleanup'][] = $filename;             // Listed for deletion once the search completes
        }
        elseif ($this->Stage[0] == "filters") {
            // There are a great many authors to check, so spread them over one folder per first character
            $dir = $ncbi_CachePath . "user_records/" . substr(str_replace("\"", "", $QueryData2['term']), 0, 1) . "/";
            if (!is_dir($dir)) mkdir($dir, 0777, true);     // Recursive, so a missing user_records/ folder is created too
            $filename = $dir . str_replace("[Au]", "", str_replace("\"", "", implode(",", $QueryData2))) . ".txt";
        }

        // Use the cached copy if there is one from this calendar month
        if (($filename !== null) and file_exists($filename)) {
            if (date('Y-m', filectime($filename)) != date('Y-m')) unlink($filename);
            else {
                $cached = file_get_contents($filename);
                if ($cached !== false) {                    // An unreadable cache file falls through to a fresh request
                    $this->Xml = $cached;
                    return true;
                }
            }
        }

        // NCBI asks callers to identify themselves; set these in the user settings
        $QueryData['tool'] = $ncbi_Tool;
        $QueryData['email'] = $ncbi_Email;

        $this->Connection->setUrl($Url);
        $this->Connection->setQueryData($QueryData);
        try {
            $this->Connection->send();
        }
        catch (Exception $e) {
            // Connection errors surface as exceptions: queue a warning and reload the page
            $this->Html .= "\n\nWarning: caught an exception when trying to retreive the XML data from PubMed. I am going to reload the page and hope that the problem goes away. The error given was: " . htmlspecialchars($e->getMessage(), ENT_QUOTES, 'UTF-8') . "\n\n\n";
            $this->SaveReload();
        }
        $this->Xml = $this->Connection->GetResponseBody();

        // strpos() may legitimately return 0, so compare against false explicitly
        if (strpos($this->Xml, "Unable to obtain query #1") !== false) {
            $this->Html .= "\n\nWarning: PubMed returned error #1. I am going to try to refresh the PubMed session variable, then reload the page and try again.\n\n\n";
            $this->PubmedIds();                             // The WebEnv session has probably expired: fetch a new one ...
            $this->SaveReload();                            // ... and reload
        }

        // Cache the new file, and say so if that did not work
        if ($filename !== null) {
            $written = file_put_contents($filename, $this->Xml);
            if (($written === false) or !file_exists($filename)) {
                $this->Html .= "\n\nWarning: could not save cache file (" . htmlspecialchars($filename, ENT_QUOTES, 'UTF-8') . ")\n\n";
            }
        }
        return true;
    }

    /**
     * Function: PubmedSearch()
     * Runs (or continues) the search selected by $this->Stage[1].
     *
     * 1. Sets $this->SearchTerm / $this->MinDate from the form data and calls
     *    PubmedIds() to obtain the hit count, query translation and WebEnv.
     * 2. If CheckCache() reports a cached author list, the cached count is used.
     *    When it reports status 2 the cache is first brought up to date: items
     *    that entered the search window since the cache was written are added,
     *    and (when a year limit is set) items that have dropped out of the far
     *    end of the window are removed.
     * 3. Otherwise the authors are extracted with PubmedExtractAuthors() and the
     *    list is saved with SaveAuthors().
     *
     * Returns true once the search is complete.
     */
    function PubmedSearch() {
        global $ncbi_CachePath;

        $n = $this->Stage[1];
        $this->SearchTerm = trim($this->PostData['Searches'][$n]['SearchString']);
        $this->MinDate = trim($this->PostData['Searches'][$n]['Since']);
        // The search term is user input and the notices below end up in the HTML page
        $SafeTerm = htmlspecialchars($this->SearchTerm, ENT_QUOTES, 'UTF-8');

        // If the count and translation haven't been set, or the PubMed WebEnv variable is missing, get them now
        if (($this->Stage[2] == 1) OR (empty($this->WebEnv)) OR (empty($this->Translation))) $this->PubmedIds(true);
        // Make sure we have enough memory to deal with very large result sets
        if (!empty($this->PostData['Searches'][$n]['Count']) and ($this->PostData['Searches'][$n]['Count'] > 1000000)) ini_set('memory_limit', '1024M');
        $this->Translation = $this->PostData['Searches'][$n]['Translation'];

        /***** TEMP FIX ******/
        // Renames author-list caches from the v0.4b naming scheme to the v0.5b one.
        // Can be removed once the old cache file names have all been updated.
        $o = $ncbi_CachePath . "author_lists/" . trim($this->MinDate, ".") . "," . str_replace("/", "", trim($this->SearchTerm, ".")) . ".txt";
        $c = $this->GetCacheName();
        if (file_exists($o) and !file_exists($c)) {
            $this->StickyHtml .= "\n\nNotice: Updated old cache file from v0.4 to v0.5 for '{$SafeTerm}'.\n\n\n";
            rename($o, $c);
        }
        /*** END TEMP FIX ***/

        // See if we already have this search cached
        if (($s = $this->CheckCache()) !== 0) {
            $Attribs = $this->GetCacheAttribs();
            $this->PostData['Searches'][$n]['Count'] = $Attribs['Count'];

            if ($s == 2) {
                /*** CACHE UPDATER ***/
                $this->CurResults = $this->GetCacheResults();
                $old_count = count($this->CurResults);

                // Both windows run from the first of the month the cache was written to the
                // first of the current month, formatted YYYY/MM/DD like the dates PubmedIds() builds
                $from = getdate($Attribs['Updated']);
                $to = getdate();

                // Pass 1: add everything that has appeared since the cache was written
                $this->PubmedIds(true, sprintf("%04d/%02d/01", $from['year'], $from['mon']), sprintf("%04d/%02d/01", $to['year'], $to['mon']), false);
                if (!empty($this->PostData['Searches'][$n]['Count'])) $this->PubmedExtractAuthors(true, false);
                $added = count($this->CurResults) - $old_count;
                $added_papers = $this->PostData['Searches'][$n]['Count'];

                // Pass 2: remove what has fallen out of the far end of the window. Without a
                // year limit nothing ever expires; running this pass anyway would shift the
                // window by zero years and subtract exactly what pass 1 just added.
                $removed = 0;
                $removed_papers = 0;
                if (!empty($this->MinDate)) {
                    $this->PubmedIds(true, sprintf("%04d/%02d/01", $from['year'] - $this->MinDate, $from['mon']), sprintf("%04d/%02d/01", $to['year'] - $this->MinDate, $to['mon']), false);
                    if (!empty($this->PostData['Searches'][$n]['Count'])) $this->PubmedExtractAuthors(true, true);
                    $removed = ($old_count + $added) - count($this->CurResults);
                    $removed_papers = $this->PostData['Searches'][$n]['Count'];
                }

                $this->PubmedIds();                         // Run a normal PubmedIds in order to get the new hit count
                $this->SaveAuthors();
                $new_count = count($this->CurResults);
                $total = $this->PostData['Searches'][$n]['Count'];
                $this->StickyHtml .= "\n\nNotice: updated cache for {$SafeTerm}: added $added counts for $added_papers papers and removed $removed counts for $removed_papers papers. New total is $new_count authors from {$total} papers.\n\n\n";
                unset($this->CurResults);
            }

            unset($this->Translation, $this->WebEnv);
            return true;
        }

        // No cache: run the search
        if ($this->PubmedExtractAuthors() === false) exit;

        // We only get this far once the search has completed, so the author list can be cached
        $this->SaveAuthors();
        unset($this->Translation, $this->WebEnv, $_SESSION['WebEnv'], $this->CurResults, $_SESSION['CurResults']);
        return true;
    }

    /**
     * Function: PubmedIds ( tidy, mindate, maxdate, escape_empty_results )
     * Runs an ESearch for $this->SearchTerm and stores Entrez's QueryKey / WebEnv
     * for later retrieval of the full items. Also records the hit count and the
     * query translation for the current search in $this->PostData.
     *
     * - tidy:                 when true, drop the parsed response and return true
     * - mindate / maxdate:    explicit date range; when either is missing, the range
     *                         is derived from the search's "Since" value (if any)
     * - escape_empty_results: when true, a search with no hits prints a "no items"
     *                         page and exits
     * Returns true (tidy) or the parsed response array.
     */
    function PubmedIds($tidy = false, $mindate = false, $maxdate = false, $escape_empty_results = true) {
        // Die on error -- this should NEVER occur
        if (empty($this->SearchTerm)) die("\n\nError: missing search term for new search in this->PubmedIds.\n\n");

        $n = $this->Stage[1];

        // Construct the Query Data
        $QueryData = array(
            "db" => "pubmed",
            "retmode" => "xml",
            "usehistory" => "y",
            "term" => $this->SearchTerm,
        );
        if ($mindate and $maxdate) {
            $QueryData['mindate'] = $mindate;
            $QueryData['maxdate'] = $maxdate;
        }
        elseif (!empty($this->PostData['Searches'][$n]['Since'])) {
            $y = date('Y') - $this->PostData['Searches'][$n]['Since'];
            $QueryData['mindate'] = $y . "/" . date('m') . "/01";
            $QueryData['maxdate'] = date('Y/m/01');
        }

        $this->GetXmlFile("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi", $QueryData);
        $this->Array = $this->XmlParser($this->Xml);
        unset($this->Xml);                                  // clean up!

        // Read the response defensively: an unexpected reply may lack any of these keys
        $this->QueryKey = isset($this->Array['eSearchResult']['QueryKey']) ? $this->Array['eSearchResult']['QueryKey'] : null;
        $this->WebEnv = isset($this->Array['eSearchResult']['WebEnv']) ? $this->Array['eSearchResult']['WebEnv'] : null;
        $Count = isset($this->Array['eSearchResult']['Count']) ? $this->Array['eSearchResult']['Count'] : 0;
        $Translation = isset($this->Array['eSearchResult']['QueryTranslation']) ? $this->Array['eSearchResult']['QueryTranslation'] : "";

        // Check that pubmed actually has some entries matching the query
        if ($escape_empty_results and empty($Count)) {
            // The term is user input going into an HTML page, so it is escaped
            $this->Html .= "\n\nSorry, PubMed has no items matching your search for " . htmlspecialchars($QueryData['term'], ENT_QUOTES, 'UTF-8') . ". Check your spelling, or try an older date cutoff.\n\n\n";
            $this->OutputHtml();
            exit;
        }

        // The translation carries its double quotes as &quot; entities
        $Translation = str_replace("&quot;", "\"", $Translation);
        // The last 40 characters are cut to drop the EDAT date conditions, so that the
        // cache name is not tied to one month. That only applies when a date range was
        // actually sent; otherwise the cut would remove part of the query itself.
        if (isset($QueryData['mindate'])) $Translation = substr($Translation, 0, -40);
        $this->PostData['Searches'][$n]['Translation'] = $Translation;
        $this->PostData['Searches'][$n]['Count'] = $Count;

        // If we're being tidy, remove the parsed response and return true
        if (!empty($tidy)) {
            unset($this->Array);
            return true;
        }
        return $this->Array;
    }

    /****************************************************************************************************************************************
     * Function PubmedExtractAuthors ( single, reverse )
     * - Takes the search and finds all authors
     * I'm not entirely sure what the most efficient way of doing this is. I have tried using the XML>Array function, but this is
     * insanely memory intensive. The simple loop and text-search works, but I'm not convinced that there aren't better ways (a
     * preg_match, perhaps?) to do it.
* Input vars:
* - Single: gets all results in a single file rather than [$ncbi_RetMax] items at a time (used when updating caches at the start of the month)
* - Reverse: subtracts matching authors from the list, instead of increasing their counts
* (The "noisy" debug flag mentioned in older notes is not a parameter of this function.)
*
* Works on $this->CurResults (author name => count). In normal mode the function
* fetches one page of summaries per page load and reloads the page through
* SaveReload() until every result has been read; it only returns (true) once the
* search is complete. In single mode it always returns true after one request.
*/
    function PubmedExtractAuthors($single = false, $reverse = false) {
        global $ncbi_CachePath, $ncbi_RetMax;

        // Opening tag of an author entry in the summary XML. It is 34 characters long and
        // the closing "</Item>" is 7, which is what the substr() offsets below rely on.
        $AuthorTag = "<Item Name=\"Author\" Type=\"String\">";

        // Check if we have a partial cache: skip past every 50,000-result checkpoint
        // file that was written during the current calendar month
        if (!$single and ($this->Stage[2] == 1)) {
            $i = 50001;
            while (true) {
                $c = $ncbi_CachePath . "tmp/" . trim($this->MinDate, ".") . "," . md5($this->Translation) . "," . $i;
                if (!file_exists($c) or (date('Y-m', filectime($c)) != date('Y-m'))) break;
                $this->Stage[2] = $i;
                $i += 50000;
                $_SESSION['Cleanup'][] = $c;
            }
        }

        // Run the search
        $this->QueryData = array(
            "db" => "pubmed",
            "retmode" => "xml",
            "usehistory" => "y",
            "WebEnv" => $this->WebEnv,
            "query_key" => $this->QueryKey,
            "retstart" => $this->Stage[2],
            "retmax" => $ncbi_RetMax
        );
        // If we're retrieving this as a single file, amend "retmax" to get all of the results
        if ($single) $this->QueryData['retmax'] = $this->PostData['Searches'][$this->Stage[1]]['Count'];
        $this->GetXmlFile("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi", $this->QueryData);

        // Extract all of the authors from the XML string, one line at a time
        foreach (explode("\n", $this->Xml) as $line) {
            $line = trim($line);
            if (substr($line, 0, 34) != $AuthorTag) continue;
            $NewAuthor = substr($line, 34, -7);

            if ($reverse) {
                // Reverse mode: reduce the count, and drop authors who no longer have any pubs.
                // Authors that are not in the list are left alone rather than given a negative count.
                if (isset($this->CurResults[$NewAuthor])) {
                    $this->CurResults[$NewAuthor] -= 1;
                    if ($this->CurResults[$NewAuthor] <= 0) unset($this->CurResults[$NewAuthor]);
                }
            }
            elseif (!empty($this->CurResults[$NewAuthor])) $this->CurResults[$NewAuthor] += 1;
            else $this->CurResults[$NewAuthor] = 1;
        }

        if ($single) return true;                           // Single-search mode is done after one request

        $this->Stage[2] += $ncbi_RetMax;                    // Otherwise, move on to the next page of results
        if ($this->Stage[2] < $this->PostData['Searches'][$this->Stage[1]]['Count']) {
            // Every 50,000 results the author list is written to a temp file and emptied:
            // processing slows right down once CurResults grows beyond that
            if ((($this->Stage[2] - 1) % 50000) == 0) {
                $this->TempSave();
                unset($this->CurResults, $_SESSION['CurResults'], $_SESSION['AllResults'][$this->Stage[1]]);
            }
            $this->SaveReload();
        }

        // If we make it this far, the search is over. Before moving on, read the temp files back in
        if ($this->Stage[2] > 50000) $this->TempRead();
        return true;
    }

    /**
     * Function: RunFilters()
     * Filters the author lists.
     *
     * Loads one author list per search (from the session if a previous page load
     * already started filtering, otherwise from the cache files) and walks the
     * list of search 1. An author is kept in $this->AuthorList when they appear in
     * every search's list, pass PubmedAuLimits() and, if an ignore list is set,
     * are not rejected by PubmedIgnore().
     *
     * After more than 200 of those per-author checks the progress is saved and
     * the page is reloaded, so long lists are processed over several page loads.
     *
     * Returns true when every author has been processed.
     */
    function RunFilters() {
        // Get results from the session, or from the cache files
        if (!empty($_SESSION['AllResults'])) $this->AllResults = $_SESSION['AllResults'];
        else {
            foreach ($this->PostData['Searches'] as $i => $Search) {
                if (!$this->CheckCache($i)) die("Missing cache file");
                $this->AllResults[$i] = $this->GetCacheResults($i, $Search['AtLeast'], $Search['AtMost']);
            }
            arsort($this->AllResults[1]);
        }
        if (empty($_SESSION['ToFilter'])) $_SESSION['ToFilter'] = count($this->AllResults[1]);

        $Searches = count($this->PostData['Searches']);
        $Checks = 0;                                        // Per-author checks made during this page load
        $Done = 0;                                          // Authors processed during this page load
        $UseIgnore = isset($this->PostData['Ignore']) and isset($this->PostData['IgnoreCoAus']);

        foreach ($this->AllResults[1] as $key => $value) {
            // Check the author shows up in all lists
            $r = true;
            foreach ($this->AllResults as $AuthorList) {
                if (empty($AuthorList[$key])) {
                    $r = false;
                    break;
                }
            }

            // Now check the author has enough pubs
            if ($r) {
                if (!$AuStats = $this->PubmedAuLimits($key)) $r = false;
                $Checks += 1;
            }

            // Exclude based on the ignore list
            if ($r and $UseIgnore) {
                if ($this->PubmedIgnore($key)) $r = false;
                $Checks += 1;
            }

            // Survivors are copied into the final list, one count per search plus the limit statistics
            if ($r) {
                $this->AuthorList[$key][1] = $value;
                for ($i = 2; $i <= $Searches; $i++) {
                    $this->AuthorList[$key][$i] = $this->AllResults[$i][$key];
                    unset($this->AllResults[$i][$key]);
                }
                foreach (array('All', '5yr', '1yr', 'Senior') as $Stat) {
                    if (isset($AuStats[$Stat])) $this->AuthorList[$key][$Stat] = $AuStats[$Stat];
                }
            }

            unset($this->AllResults[1][$key]);
            $Done += 1;

            // Enough work for one page load: record the progress and reload
            if ($Checks > 200) {
                if (!empty($_SESSION['Filtered'])) $_SESSION['Filtered'] += $Done;
                else $_SESSION['Filtered'] = $Done;
                $this->Stage = array("filters", 0, 0);
                $this->SaveReload();
                exit;
            }
        }
        return true;
    }

    /****************************************************************************************************************************************
     * Function: PubmedAuLimits ( Author )
     * - Checks whether the author meets the seniority requirements
     */
    function PubmedAuLimits($Au) {
        $Author = "\"" . $Au .
"\""; $Url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"; $QueryData = array( "db" => "pubmed", "retmode" => "xml", "rettype" => "count", "usehistory" => "y", "term" => $Author, ); /* 1: Total number of publications */ if ($this->PostData['Active-All']) { $this->GetXmlFile($Url, $QueryData); $this->Array = $this->XmlParser($this->Xml); $k = $this->Array['eSearchResult']['Count']; if (($k < $this->PostData['AtLeast']) OR ($k > $this->PostData['AtMost'])) return false; $r['All'] = $k; } /* 2: Between A and B pubs in past five years */ if ($this->PostData['Active-5yr']) { $QueryData['reldate'] = "1825"; $this->GetXmlFile($Url, $QueryData); $this->Array = $this->XmlParser($this->Xml); $c = $this->Array['eSearchResult']['Count']; if (($c < $this->PostData['AtLeast-5yr']) OR ($c > $this->PostData['AtMost-5yr'])) return false; $r['5yr'] = $c; } /* 3: At least X pubs in past year */ if ($this->PostData['Active']) { $QueryData['reldate'] = "365"; $this->GetXmlFile($Url, $QueryData); $this->Array = $this->XmlParser($this->Xml); $y = $this->Array['eSearchResult']['Count']; if (($y < $this->PostData['AtLeast-1yr']) OR ($y > $this->PostData['AtMost-1yr'])) return false; $r['1yr'] = $y; } if (isset($QueryData['reldate'])) unset($QueryData['reldate']); /* 4: At least X pubs as senior author */ $QueryData['term'] .= "[lastau]"; // Amend the query term to last author if ($this->PostData['Active-Senior']) { $this->GetXmlFile($Url, $QueryData); $this->Array = $this->XmlParser($this->Xml); $s = $this->Array['eSearchResult']['Count']; if (($s < $this->PostData['AtLeastSe']) OR ($s > $this->PostData['AtMostSe'])) return false; $r['Senior'] = $s; } /* 5. 
Years began publishing */ $QueryData['term'] = $Author; if ($this->PostData['Active-Old']) { $mindate = array((date('Y')-$this->PostData['LimOld']), date('m'), "01"); $maxdate = array((date('Y')-$this->PostData['LimYoung']), date('m'), "01"); $QueryData['mindate'] = "1900/01/01"; $QueryData['maxdate'] = implode("/", $mindate); // See if there is anything older than the old cutoff $this->GetXmlFile($Url, $QueryData); $this->Array = $this->XmlParser($this->Xml); if (!empty($this->Array['eSearchResult']['Count'])) return false; $QueryData['mindate'] = implode("/", $mindate); // And make sure that there is something older than the yound cutoff $QueryData['maxdate'] = implode("/", $maxdate); $this->GetXmlFile($Url, $QueryData); $this->Array = $this->XmlParser($this->Xml); if (empty($this->Array['eSearchResult']['Count'])) return false; } /* 6: Years began publishing as senior author */ $QueryData['term'] .= "[lastau]"; if ($this->PostData['Active-Old-Senior']) { $mindate = array((date('Y')- $this->PostData['SeLimOld']), date('m'), "01"); $maxdate = array((date('Y')- $this->PostData['SeLimYoung']), date('m'), "01"); $QueryData['mindate'] = "1900/01/01"; $QueryData['maxdate'] = implode("/", $mindate); // See if there is anything older than the old cutoff $this->GetXmlFile($Url, $QueryData); $this->Array = $this->XmlParser($this->Xml); if (!empty($this->Array['eSearchResult']['Count'])) return false; $QueryData['mindate'] = implode("/", $mindate); // And make sure that there is something older than the yound cutoff $QueryData['maxdate'] = implode("/", $maxdate); $this->GetXmlFile($Url, $QueryData); $this->Array = $this->XmlParser($this->Xml); if (empty($this->Array['eSearchResult']['Count'])) return false; } if (isset($r)) return $r; else return true; } /**************************************************************************************************************************************** * Function: PubmedIgnore ( Author ) * - Checks whether the author matches the entries 
in the ignore list
     *   - Runs one esearch count query for "<author> AND (<ignore list>)", dated from the IgnoreCoAus value
     *     up to the current year
     *   Returns: true when that query counts at least one publication, false otherwise
     */
    function PubmedIgnore($key) {
        $IgnoreTerm = $key . " AND (" . $this->PostData['Ignore'] . ")";
        $Endpoint = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?";

        // Built key by key, in the same order as the request has always been sent
        $Params = array();
        $Params["db"] = "pubmed";
        $Params["retmode"] = "xml";
        $Params["rettype"] = "count";
        $Params["usehistory"] = "y";
        $Params["mindate"] = $this->PostData['IgnoreCoAus'];
        $Params["maxdate"] = date('Y');
        $Params["term"] = $IgnoreTerm;

        $this->GetXmlFile($Endpoint, $Params);
        $this->Array = $this->XmlParser($this->Xml);

        // Any non-zero count means that the author is caught by the ignore list
        return !empty($this->Array['eSearchResult']['Count']);
    }

    /*************************************************************************************************************************************************
     FILESYSTEM/CACHE FUNCTIONS
     **************************
     1. GetCacheName - returns full path to md5 hashed translated query filename
     2. GetCacheAttribs - returns an assoc for the file metadata above the [Results] header
     3. GetCacheResults - returns an assoc for the search results
     4. CheckCache - checks to see if the cache file exists, and if so, if it is current
     5. SaveAuthors - dumps the author list in a cache file
     6. TempSave - dumps the author list to a temporary file - this saves memory and cpu as the CurResults array grows, and also prevents loss of data if search is interrupted
     7.
TempRead - extracts the author list from the temporary file and reads it back into this->CurResults
    **************************************************************************************************************************************************/

    /***************************************************************************************************************************************
     * Function: GetCacheName ( [Search], [Path] )
     *   - Makes the cache filename
     *   - Search: index into PostData['Searches']; defaults to the currently active search ($this->Stage[1])
     *   - Path:   folder below $ncbi_CachePath; defaults to "author_lists/"
     *   Returns: <cache path><Path><Since>-<md5 of the translated query>.txt
     */
    function GetCacheName($Search = null, $Path = null) {
        global $ncbi_CachePath;
        if (empty($Search)) $Search = $this->Stage[1];      // Set the QueryTranslation to the currently active query
        $Translation = $this->PostData['Searches'][$Search]['Translation'];
        $MinDate = $this->PostData['Searches'][$Search]['Since'];
        if (empty($Path)) $Path = "author_lists/";          // The default path is to the final author lists cache
        // Build the filename from the full path and md5 scrambled translation
        return $ncbi_CachePath . $Path . $MinDate . "-" . md5($Translation) . ".txt";
    }

    /***************************************************************************************************************************************
     * Function GetCacheAttribs ( [Search] )
     *   - Gets everything above the [Results] section from the cache file
     *   Returns: 0 if the cache file does not exist (a warning is appended to $this->StickyHtml), otherwise an
     *            array of name => value pairs which always contains 'Updated'
     */
    function GetCacheAttribs($Search = null) {
        if (empty($Search)) $Search = $this->Stage[1];      // Set the QueryTranslation to the currently active query
        $f = $this->GetCacheName($Search);
        if (!file_exists($f)) {
            $this->StickyHtml .= "

Warning: script ran GetCacheAttribs() on a non-existent cache.

\n";
            return 0;                                       // Return 0 for files which do not exist
        }
        $x = file_get_contents($f);                         // Extract from the file
        $x = str_replace("\r\n", "\n", $x);                 // This is a (temp?) fix for processing *nix generated author lists on windows servers
        $a = explode("[Results]\n", $x);
        $a = explode("\n", $a[0]);
        $r = array();
        foreach ($a as $k => $v) {
            if (empty($v)) continue;
            // Split on the first "=" only: values such as search terms may themselves contain "="
            $v = explode("=", $v, 2);
            if (!isset($v[1])) continue;                    // Not a name=value line
            $r[$v[0]] = $v[1];
        }
        /*** TEMP FIX ***/
        // This is for v0.3 cache files which had Expiry attributes not Updated attribs. It can be deleted once the old cache files have been updated
        if (empty($r['Updated'])) $r['Updated'] = filectime($f);
        return $r;
    }

    /***************************************************************************************************************************************
     * Function GetCacheResults ( [Search], [AtLeast], [AtMost] )
     *   - Extracts the author list from a cache file and returns it as an array
     *   - AtLeast / AtMost: when given, authors with counts outside these limits are left out
     *   Returns: false if CheckCache() reports no cache file, otherwise an author => count array (empty if nothing matches)
     */
    function GetCacheResults($Search = null, $AtLeast = null, $AtMost = null) {
        if (empty($Search)) $Search = $this->Stage[1];      // Set the QueryTranslation to the currently active query
        if (!$this->CheckCache($Search)) return false;      // If the cache doesn't exist, return false
        $f = $this->GetCacheName($Search);
        $x = file_get_contents($f);                         // Extract from the file
        $x = str_replace("\r\n", "\n", $x);                 // This is a (temp?) fix for processing *nix generated author lists on windows servers
        $a = explode("[Results]\n", $x);
        $r = array();                                       // Always hand back an array, even when no author qualifies
        if (!isset($a[1])) return $r;                       // No [Results] section in the file
        $a = explode("\n", $a[1]);
        foreach ($a as $k => $v) {
            if (empty($v)) continue;
            $v = explode("=", $v);
            if (!isset($v[1])) continue;                    // Not an author=count line
            if (isset($AtLeast) and ($v[1] < $AtLeast)) continue;
            if (isset($AtMost) and ($v[1] > $AtMost)) continue;
            $r[$v[0]] = $v[1];
        }
        return $r;
    }

    /***************************************************************************************************************************************
     * Function CheckCache ( [Search] )
     *   - checks to see if the cache file exists, and if so, if it is current
     *   Returns: 0 = no cache file, 1 = cache was updated in the current calendar month, 2 = cache needs updating
     */
    function CheckCache($Search = null) {
        if (empty($Search)) $Search = $this->Stage[1];      // Set the QueryTranslation to the currently active query
        $f = $this->GetCacheName($Search);
        if (!file_exists($f)) return 0;                     // Return 0 for files which do not exist
        $a = $this->GetCacheAttribs($Search);
        if ((date('Y', $a['Updated']) == date('Y')) and (date('m', $a['Updated']) == date('m'))) return 1; // Return 1 for files which are up-to-date
        return 2;                                           // Return 2 for files which need updating
    }

    /****************************************************************************************************************************************
     * Function: SaveAuthors
     *   - I wanted to use an XML layout for the saved authors list, but that considerably raises the amount of memory required (e.g. to get
     *     the array back out of the xml file). Therefore, we have to put up with a cheap and dirty solution
     *   - In v0.4 this had a "TempSave" option, now moved to a dedicated function
     *   Returns: true once the cache file has been written and the files listed in $_SESSION['Cleanup'] have been
     *            deleted; false if the cache file could not be opened (the temporary files are then left alone)
     */
    function SaveAuthors() {
        $cutoff = 1;                                        // If there are too many results, we can ignore all who have only one item
        $total = count($this->CurResults);                  // for the sake of performance
        if ($total > 250000) $cutoff = 2;                   // (the original cutoff was based on results for the search - i.e. this->PostData[Searches][n][Count])
        $a = array();                                       // Stays an empty list when nobody reaches the cutoff
        foreach ($this->CurResults as $author => $count) {
            if ($count >= $cutoff) $a[] = $author . "=" . $count;
        }

        if ($this->CheckCache() and ($o = $this->GetCacheAttribs())) {  // Find out when the cache was first made
            if (!empty($o['Created'])) $Created = $o['Created'];        // This if/else is to cope with pre v0.5 cache files which do not have the created
            else $Created = $o['Updated'];                              // data - when those cache files have been updated, this can be removed.
        }
        else $Created = time();

        $t = "SearchTerm=" . $this->SearchTerm . "\nMinDate=" . $this->MinDate . "\nQueryTranslation=" . $this->Translation ."\nCreated=" . $Created ."\nUpdated=" . time() . "\nCount=" . $this->PostData['Searches'][$this->Stage[1]]['Count'] . "\nAuCount=" . count($this->CurResults) . "\n\n[Results]\n" . implode("\n", $a);

        $c = $this->GetCacheName();                         // Save the cache to the file
        if (file_exists($c)) unlink($c);
        $f = fopen($c, 'w+');
        if ($f === false) return false;                     // Nothing was saved, so keep the temporary files
        fwrite($f, $t);
        fclose($f);
        unset($a, $t, $f, $o);

        // At this point we can delete the redundant XML files
        if (!empty($_SESSION['Cleanup'])) foreach ($_SESSION['Cleanup'] as $f) {
            if (file_exists($f)) unlink($f);
        }
        return true;
    }

    /****************************************************************************************************************************************
     * Function: TempSave
     *   - Makes a temporary file of results part way through the search just in-case it gets interrupted
     *   - Moved out from SaveAuthors in v0.5
     *   Returns: true once the file has been written and added to $_SESSION['Cleanup']; false if it could not be opened
     */
    function TempSave() {
        global $ncbi_CachePath;
        $c = $ncbi_CachePath . "tmp/" . trim($this->MinDate, ".") . "," . md5($this->Translation) . "," . $this->Stage[2];
        $a = array();                                       // Stays an empty list when there are no authors yet
        if (!empty($this->CurResults)) {
            foreach ($this->CurResults as $author => $count) {
                $a[] = $author . "=" . $count;
            }
        }
        $t = "[Results]\n" . implode("\n", $a);             // Stripped down version of SaveAuthors();
        if (file_exists($c)) unlink($c);
        $f = fopen($c, 'w+');
        if ($f === false) return false;
        fwrite($f, $t);
        fclose($f);
        $_SESSION['Cleanup'][] = $c;                        // Add this to the list of temporary files which will need deleting at the end of the search
        return true;
    }

    /****************************************************************************************************************************************
     * Function TempRead
     *   - Extracts saved authors from temporary files and reads them into the CurResults array
     *   - Files are read in steps of 50,000 results, starting at 50001, until the result count of the active
     *     search is passed or a file is missing
     *   Returns: true
     */
    function TempRead() {
        global $ncbi_CachePath;
        $i = 50001;
        while ($i <= $this->PostData['Searches'][$this->Stage[1]]['Count']) {
            $c = $ncbi_CachePath . "tmp/" . trim($this->MinDate, ".") . "," . md5($this->Translation) . "," . $i;
            if (!file_exists($c)) break;

            $x = file_get_contents($c);
            $x = str_replace("\r\n", "\n", $x);             // This is a fix for processing *nix generated author lists on windows servers
            $a = explode("[Results]\n", $x);
            unset($x, $c);
            $a = isset($a[1]) ? explode("\n", $a[1]) : array();   // A file without a [Results] section holds no authors
            foreach ($a as $k => $r) {
                $r = explode("=", $r);
                if (empty($r[1])) continue;
                if (!empty($this->CurResults[$r[0]])) $this->CurResults[$r[0]] += $r[1];
                else $this->CurResults[$r[0]] = $r[1];
                unset($a[$k], $r);                          // Memory saving device -- very large datasets can cause the memory to exceed 512M here
            }
            $i += 50000;
        }
        return true;
    }

    /****************************************************************************************************************************************
     * Function: XmlParser ( [input] )
     *   - A generic XML to Array function
     *   - input: XML string; when empty, the last Entrez response ($this->Xml) is parsed instead
     *   Returns: nested array keyed by tag name; '_a' holds attributes, '_c' children and '_v' the value, with
     *            single-entry '_c'/'_v' wrappers collapsed by _del_p()
     */
    function XmlParser($input = false) {
        // The property is declared as $Xml; property names are case-sensitive, so "$this->XML" never held the response
        if (empty($input)) $input = $this->Xml;

        // First parse into struct
        $parser = xml_parser_create();
        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
        xml_parse_into_struct($parser, $input, $vals, $index);
        xml_parser_free($parser);
        unset($input, $index);

        // From http://mysrc.blogspot.com/2007/02/php-xml-to-array-and-backwards.html
        // $ary always references the children of the element that is currently open; '_p' is a temporary
        // reference back to the parent level, which is followed again on 'close' and removed by _del_p().
        $mnary=array();
        $ary=&$mnary;
        foreach ($vals as $r) {
            $t=$r['tag'];
            if ($r['type']=='open') {
                if (isset($ary[$t])) {
                    // Repeated tag: turn the entry into a numbered list and append a new element
                    if (isset($ary[$t][0])) $ary[$t][]=array();
                    else $ary[$t]=array($ary[$t], array());
                    $cv=&$ary[$t][count($ary[$t])-1];
                }
                else $cv=&$ary[$t];
                if (isset($r['attributes'])) {foreach ($r['attributes'] as $k=>$v) $cv['_a'][$k]=$v;}
                $cv['_c']=array();
                $cv['_c']['_p']=&$ary;
                $ary=&$cv['_c'];
            }
            elseif ($r['type']=='complete') {
                if (isset($ary[$t])) { // same as open
                    if (isset($ary[$t][0])) $ary[$t][]=array();
                    else $ary[$t]=array($ary[$t], array());
                    $cv=&$ary[$t][count($ary[$t])-1];
                }
                else $cv=&$ary[$t];
                if (isset($r['attributes'])) {foreach ($r['attributes'] as $k=>$v) $cv['_a'][$k]=$v;}
                $cv['_v']=(isset($r['value']) ? $r['value'] : '');
            }
            elseif ($r['type']=='close') {
                $ary=&$ary['_p'];
            }
        }
        unset($vals);
        $this->_del_p($mnary);
        return $mnary;
    }

    // _Internal: Remove recursion in result array
    function _del_p(&$ary) {
        foreach ($ary as $k => &$v) {
            if ($k === "_p") unset($ary[$k]);
            elseif (is_array($ary[$k])) $this->_del_p($ary[$k]);
            // Added JAD 24/5/08: Removes the _c/_v/_a when there's only one entry --
            // makes the array easier to handle
            if (is_array($v) and (count($v) == 1) and isset($v['_v'])) $v = $v['_v'];
            elseif (is_array($v) and (count($v) == 1) and isset($v['_c'])) $v = $v['_c'];
        }
    }
}
?>