Regex-based simple search engine in PHP


Some notes about this code:

<?php
// Normalise the incoming search query.
// NOTE(review): $q is never assigned before this point in the file; it is
// presumably supplied by the including script or by register_globals - confirm.
// Result: $q holds a cleaned, single-space-separated query string ('' when there
// is nothing to search for) and $query holds the individual terms.
if (!empty($q)) {
    // A q[] style parameter arrives as an array; collapse it into one string.
    if (is_array($q)) {
        $q = join(" ", $q);
    }
    // prepare for urls in query: blank out slashes, scheme prefixes, "www"
    // and a few common TLD suffixes so only the distinctive words remain
    $q = preg_replace("/\/|(http(s)*:|www|\.de|\.net|\.com|\.org)/", ' ', $q);
    // textise() (defined further down in this file) strips markup and turns
    // every non-word character into a space
    textise($q);
    // PREG_SPLIT_NO_EMPTY drops the empty tokens that leading/trailing
    // whitespace would otherwise produce, so they neither show up in the
    // cleaned query nor inflate the term count used for relevance scoring.
    $query = preg_split("/\s+/", $q, -1, PREG_SPLIT_NO_EMPTY);
    $q = join(" ", $query); // a clean version
} else {
    $q = '';
    $query = array(); // keep $query defined on the empty-query path as well
}

// prepare the searchform
// The markup below is captured through output buffering into $sform instead of
// being sent to the browser directly; later code in this file prepends $sform
// to its result output. It contains four forms: the local brute force search,
// Feedster, Google and Amazon.de, each pre-filled with the current query $q.
ob_start();
?>
<form>
<input type="text" name="q" value="<?php echo $q?>"><br>
<input type="submit" value="brute force search">
</form>
<!-- http://www.feedster.com/search.php?hl=en&ie=UTF-8&q=test&btnG=Search&sort=date
-->
<form method="get" action="http://www.feedster.com/search.php" target="_blank">
<input type="hidden" name ="sort" value="relevance">
<input type="text" name="q" value="<?php echo $q?>"><br>
<input type="submit" value="feedster">
</form>
<form method="get" action="http://www.google.com/search" target="_blank">
<small>
<input type="text" name="q" size="16" maxlength="255" value="<?php echo $q?>"><br />
<input type="submit" name="sa" value="Google">
</small>
</form>
<form>
<input type="hidden" name="searchtype" value="amazon.de">
<input type="text" name="q" value="<?php echo $q?>"><br>
<label><input type="radio" name="where" value="books-de" <?php
        // full "<?php" tags: the short "<?" form only works with short_open_tag enabled
        if (isset($where) && $where == "books-de") echo 'checked';
?>>Deutsch</label>&nbsp;
<label><input type="radio" name="where" value="books-de-intl-us"  <?php
        // English is the default selection when no catalogue has been chosen yet
        if (!isset($where) || $where == "books-de-intl-us" || $where == '') echo 'checked';
?>>Englisch</label><br>
<input type="submit" value="Amazon.de">
</form>

<?php
// Fetch the buffered markup and discard the buffer in one step.
$sform = ob_get_clean();

// Run a search only when the cleaned query contains at least one word character.
// Either branch leaves its HTML in $sbeltoutput.
// NOTE(review): $sbeltoutput is not read anywhere in this file; presumably the
// including page prints it in its content area - confirm.
if (preg_match("/\w+/", $q)) {
    if (isset($searchtype) && $searchtype == "amazon.de") {
        // --- Amazon.de lookup: fetch pre-rendered results over HTTP ---
        $bfout = $sform;
        if (empty($where)) {
            $where = "books-de-intl-us";
        }
        $amznurl  = "http://xml-eu.amazon.com/onca/xml3";
        $amznurl .= "?t=traumwind-21&dev-t=D32B4SO2Q54BGU&locale=de&PowerSearch=";
        $amznurl .= "keywords:+";
        $amznurl .= urlencode($q);
        $amznurl .= "&mode=";
        // $where is presumably request-supplied (the search form has a "where"
        // radio group), so encode it rather than pasting it into the URL raw.
        $amznurl .= urlencode($where);
        $amznurl .= "&type=lite&page=1&sort=+daterank&f=http://traumwind.de/books/traumwind.xsl";

        // HTML-escaped copy of the URL for use inside href attributes and link text.
        $amznlink = htmlspecialchars($amznurl, ENT_QUOTES);

        $amzn = '';
        $f = fopen($amznurl, "r");
        if (!$f) die("<hr>can't open <a href='$amznlink'>$amznlink</a>");
        while (!feof($f)) {
            $chunk = fread($f, 4096);
            if ($chunk === false) {
                break; // read error: stop instead of looping forever
            }
            $amzn .= $chunk;
        }
        fclose($f);

        $bfout .= "
        <style>
        amzn_list detail{
            margin: 1em;
            padding: 2em 1em 1em 1em;
            height: 240px;
            border-top: 1px solid black;
            border-left: 1px solid black;
            border-right: 3px solid black;
            border-bottom: 3px solid black;
        }
        </style>";
        // A response of 100 bytes or less is treated as "nothing found".
        if (strlen($amzn) <= 100) {
            $bfout .= "<div id='amzn_list'><div id='detail'><h2>Amazon returned <b>no results</b></h2>
            <p>if you where using more than one keyword, try putting 'or' between them</p></div></div>\n";
        } else {
            $bfout .= $amzn;
        }
        $bfout .= "<br /><a href='$amznlink'>debug url</a><br />";
        $sbeltoutput = $bfout;

    } else {
        // Brute Force PHP searching
        // take a list of keywords and simply scan each and every document for matches
        // (try to scan younger docs first)
        // martin@traumwind.de May 2003

        // where do we find the docs?
        $postingspath = "./archive";
        // how do we identify a doc from other files in dir?
        $postingsregex = ".txt$";

        // Only tokens that contain a word character are searched for, so only
        // those count towards the relevance denominator.
        $terms = array();
        if (isset($query) && is_array($query)) {
            foreach ($query as $k) {
                if (preg_match('/\w+/', $k)) {
                    $terms[] = $k;
                }
            }
        }
        $termcount = count($terms);

        $bfout = $sform;
        $posts = findposts($postingspath, $postingsregex);
        if (!is_array($posts)) {
            $posts = array(); // nothing to scan
        }

        $total  = array(); // post name => number of query terms found in it
        $words  = array(); // post name => (term => occurrence count)
        $titles = array(); // post name => tag-stripped title
        foreach ($posts as $pname => $ppath) {
            $s = file_get_contents($ppath);
            if ($s === false) {
                continue; // unreadable file: skip it rather than abort the search
            }
            // Grab the title before textise() rewrites the text.
            $title = preg_match("/title ([^\n]+)/", $s, $m) ? $m[1] : '';
            textise($s);
            foreach ($terms as $k) {
                // \b instead of \s on both sides: also counts a word at the very
                // start or end of the text, and adjacent repeats of the same word.
                $res = preg_match_all('!\b' . preg_quote($k, '!') . '\b!i', $s, $m);
                if ($res) {
                    if (!isset($total[$pname])) {
                        $total[$pname] = 0;
                    }
                    $total[$pname]++;
                    $words[$pname][$k] = $res;
                    $titles[$pname] = strip_tags($title);
                }
            }
        }

        if ($total) {
            arsort($total); // posts matching the most terms first
            $bfout .= "<ol>\n";
            foreach ($total as $pname => $hits) {
                // Percentage of query terms found; computed from integer hit
                // counts so a full match is exactly 100 (no float accumulation).
                $bfout .= "<li>(" . floor($hits * 100 / $termcount) . ") <a href='index.php?detail=$pname&'>";
                $bfout .= $titles[$pname];
                $bfout .= "</a><br>";
                arsort($words[$pname]); // most frequent term first
                foreach ($words[$pname] as $word => $count) {
                    $bfout .= "<b>$word</b>: $count&nbsp;\n";
                }
                $bfout .= "</li>\n";
            }
            $bfout .= "</ol>\n";
        } else {
            $bfout .= "<p>no results for <b>'$q'</b></p>\n";
        }
        // let's view this in the content area
        $sbeltoutput = $bfout;
    }
}
// functions
/**
 * Reduce a text to plain words, in place.
 *
 * Applied in order:
 *  1. "title <rest of line>" is replaced by the rest of the line written twice
 *     (so title words occur twice in the resulting text),
 *  2. "author <word>" is removed,
 *  3. <script>...</script> blocks are removed,
 *  4. remaining HTML tags are removed,
 *  5. every remaining non-word character becomes a single space.
 *
 * @param string $s Text to clean; modified by reference. Left untouched if
 *                  the regex engine reports an error.
 * @return void
 */
function textise(&$s) {
    $search = array(
        "'title ([^\n]+)*'",
        "'author \w+'",
        "'<script[^>]*?>.*?</script>'si",  // Strip out javascript
        "'<[\/\!]*?[^<>]*?>'si",           // Strip out html tags
        "'[^\w]'",                         // Strip out punctuation
    );
    $replace = array(
        '$1 $1',
        "",
        "",
        "",
        " ",
    );
    $cleaned = preg_replace($search, $replace, $s);
    // preg_replace() returns null on a PCRE error; do not wipe the caller's text then.
    if ($cleaned !== null) {
        $s = $cleaned;
    }
}
/**
 * List the files in a directory whose names match a pattern.
 *
 * Terminates the script via die() when the directory cannot be opened.
 *
 * @param string $path  Directory to scan (not recursive).
 * @param string $regex Pattern matched case-insensitively against each entry
 *                      name; the matched part is removed to form the array key.
 * @return array Map of stripped entry name => "$path/entry", sorted by path
 *               in descending order. Empty array when nothing matches.
 */
function findposts($path, $regex) {
    $handle = opendir($path) or die ("$path was not found");
    // eregi()/eregi_replace() were removed in PHP 7. Wrap the caller's pattern
    // in PCRE delimiters (escaping the delimiter itself) and use the "i" flag
    // to keep the case-insensitive matching.
    $pattern = '~' . str_replace('~', '\~', $regex) . '~i';
    $dir_arr = array();
    // Compare against false explicitly: an entry literally named "0" is falsy
    // and would otherwise end the loop early.
    while (($entry = readdir($handle)) !== false) {
        if (preg_match($pattern, $entry)) {
            $entry_name = preg_replace($pattern, "", $entry);
            $dir_arr[$entry_name] = "$path/" . $entry;
        }
    }
    closedir($handle);
    // sort by path in reverse (descending) order, keeping the keys
    arsort($dir_arr);
    return ($dir_arr);
}
?>


Alles Bild-, Text- und Tonmaterial ist © Martin Spernau; Verwendung und Reproduktion erfordern die Zustimmung des Autors

Martin Spernau
© 1994-2016


amazon.de Wunschliste

Facebook me!
Google+

Google

powered by Traumtank