相信很多朋友都找到那些数据并下载下来了。
但是一下下来就傻眼了,大的是7、8G的SQL文件,稍小的也是别人导成CSV格式的也有3、4G之大。普通的文本工具一打开就崩溃,即使不崩溃,电脑内存瞬间吃光也会卡死。

借由这个契机,写了几个PHP小程序去查询,顺便回顾了一下PHP中的多进程和多线程编程。

常规编程,单线程,单进程,顺序执行。

<?php
class Searcher{
    protected $fileList = [];
         
    public function __construct()
    {
        $this->currentTasks[0] = 'main';
        $dir = dir( dirname(__FILE__).'/2000W' );
        while ( ($file = $dir->read() )!=FALSE ){
            if ( strpos($file,'.csv') >0 ){
                $this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
            }
        }
        $dir->close();           
    }   
     
     
     
    public function run($word)
    {
        $start_time = time();
        foreach($this->fileList as $file){
            findInFile($file,$word);
        }
        echo "Finished in ", time()-$start_time , "seconds.\n";
    }
     
}
function findInFile($file,$word){
    echo "start searching $word in $file \n";
    $fp = fopen($file,'rb');
    $i = 0;
    while (!feof($fp)){
        $i ++;
        $line = fgets($fp);
        if ( strstr($line,$word) ){                 
            echo "$file [$l]: $line \n ";
        }
    }
    fclose($fp);
    echo "Finish searching in $file \n ";
}
 
 
$obj = new Searcher();
$obj->run('张三');

现在尝试一下用pnctl扩展实现多进程:

<?php
class Searcher{
    protected $fileList = [];
         
    public function __construct()
    {
        $this->currentTasks[0] = 'main';
        $dir = dir( dirname(__FILE__).'/2000W' );
        while ( ($file = $dir->read() )!=FALSE ){
            if ( strpos($file,'.csv') >0 ){
                $this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
            }
        }
        $dir->close();           
    }   
     
    public function runMultiProcess($word)
    {
        echo "Begin...\n";
        $start_time = time();
        foreach($this->fileList as $file){
            $lanched = $this->runProcess($file,$word);
        }
        //wait for all child processes to finish
        while ( pcntl_waitpid(0,$status) != -1){
            $status = pcntl_wexitstatus($status);
            echo "Child($status)  Exit.\n";
        }       
        $cost = time() -$start_time;
        echo "All tasks finished in $cost secends.\n";
         
    }
     
    protected function runProcess($file,$word)
    {
        $pid = pcntl_fork();
        if ($pid == -1 ){ //error occurs            
            echo "error occurs while fork new  task for [$file]\n";
            return false;
        }elseif($pid){ // parent process
            //echo "in parent process.\n";
             
        }else{//forked child
            $mypid = getmypid();
            echo "start child process $mypid to read file $file \n";
            $exitStatus = $mypid;
            findInFile($file,$word);                
            exit($exitStatus);
        }
        return true;
    }
 
     
}
function findInFile($file,$word){
    echo "start searching $word in $file \n";
    $fp = fopen($file,'rb');
    $i = 0;
    while (!feof($fp)){
        $i ++;
        $line = fgets($fp);
        if ( strstr($line,$word) ){                 
            echo "$file [$l]: $line \n ";
        }
    }
    fclose($fp);
    echo "Finish searching in $file \n ";
}
 
$obj = new Searcher();
$obj->runMultiProcess('张三');

进程是个比较烦琐的东西,什么信号啊之类的是最难掌握的,更重要的是各进程之间相互通讯比较困难。

现在再试试基于pthreads扩展的多线程

<?php
class Searcher{
    protected $fileList = [];
         
    public function __construct()
    {
        $this->currentTasks[0] = 'main';
        $dir = dir( dirname(__FILE__).'/2000W' );
        while ( ($file = $dir->read() )!=FALSE ){
            if ( strpos($file,'.csv') >0 ){
                $this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
            }
        }
        $dir->close();           
    }   
     
    public function runMultiThread($word)
    {
        $start_time = time();
        $ths = [];
        foreach($this->fileList as $i=>$file){
            $ths[$i] = new ThSearch($file,$word);
            $ths[$i]->start();
        }
        foreach($ths as $th){
            $th->join();
        }
        echo "Finished in ", time()-$start_time , "seconds.\n";
    }
         
}
function findInFile($file,$word){
    echo "start searching $word in $file \n";
    $fp = fopen($file,'rb');
    $i = 0;
    while (!feof($fp)){
        $i ++;
        $line = fgets($fp);
        if ( strstr($line,$word) ){                 
            echo "$file [$l]: $line \n ";
        }
    }
    fclose($fp);
    echo "Finish searching in $file \n ";
}
 
class ThSearch extends Thread{
    protected $file;
    protected $word;
    public function __construct($file,$word)
    {
        $this->file = $file;
        $this->word = $word;     
    }
    public function run()
    {
        findInFile($this->file,$this->word);
    }
}
 
$obj = new Searcher();
$obj->runMultiThread('张三');

线程相比进程要轻得多,而且也更好控制。

也许你会发现这三种方法性能为什么都差不多? 因为在这个例子中,性能的瓶颈主要在磁盘IO,硬盘读写上限就那么多,自然性能也相差无几了。
from https://gebsni.com/a/13/10/how-to-search-person-from-200-million-data.html

标签: PHP, 实例

添加新评论