Downloading files : A Comparative study

The idea :

A simple case study of downloading files in Perl, Ruby, Python and PHP.

The algorithm :

Assumption : The input for these scripts is obtained from a simple CGI form.

  • 0. Hit a URL which in turn responds with a list of URLs (like RealMedia metafiles).
  • 1. Download the contents of each URL that is returned by the master URL.
  • 2. Zip all the downloaded files and exit.

Downloading files in perl :

#!/usr/bin/perl -w
use strict;
use warnings; 
use CGI qw(:standard);
use LWP::Simple qw(!head);
use File::Basename;
use Archive::Zip;

# URL from the input form.
my $url = param('url');

# Validate the input.
# LWP::Simple is imported with qw(!head), so a bare head() resolves to CGI's
# HTML <head> generator, which always yields a true string.  The HTTP HEAD
# helper therefore has to be called by its fully qualified name.
if (defined $url and LWP::Simple::head($url))
{
    # Get the album name,
    # in this case the second last part.
    my @pieces = split m{/}, $url;
    my $album  = $pieces[-2];
    my @songs  = ();

    # Fetch the urls; get() returns undef when the request fails.
    my $urls = get($url);
    warn "Could not fetch the url list from $url\n" unless defined $urls;
    my @lines = defined $urls ? split(/\r?\n/, $urls) : ();

    # For each url in the url list.
    foreach my $line (@lines)
    {
        # Drop surrounding whitespace and skip blank lines.
        (my $song_url = $line) =~ s/\A\s+|\s+\z//g;
        next if $song_url eq '';

        my $song = basename($song_url);

        # The major part of downloading.
        # getstore() returns the HTTP status code; only files that were
        # actually stored are remembered for the archive.
        if (is_success(getstore($song_url, $song)))
        {
            push @songs, $song;
        }
        else
        {
            warn "Could not download $song_url\n";
        }
    }

    if (@songs)
    {
        my $compressor = Archive::Zip->new();

        # For each song that was pushed before.
        foreach my $song (@songs)
        {
            # Add them to the zip archive.
            $compressor->addFile($song);
        }

        # Write the zip file and report a failed write.
        my $status = $compressor->writeToFileNamed($album . '.zip');
        warn "Could not write $album.zip\n"
            unless $status == Archive::Zip::AZ_OK();
    }
    else
    {
        warn "Nothing was downloaded from $url\n";
    }
}
else
{
 print "<b> Please check the URL! </b>";
}

Downloading files in ruby :

#!/usr/bin/ruby
# Reads a playlist URL from the CGI parameter "url", downloads every URL
# listed in it and stores the files in "<album>.zip", where <album> is the
# second-to-last segment of the playlist URL.
require 'cgi'
require 'uri'
require 'open-uri'
require 'zip/zip' # rubyzip (< 1.0), which provides Zip::ZipFile

# Returns a URI object when +text+ parses as an http(s) URL, otherwise nil.
# Fetching through URI#read instead of Kernel#open keeps user input such as
# "|command" from ever being treated as a pipe.
def http_uri(text)
  uri = URI.parse(text)
  uri.is_a?(URI::HTTP) ? uri : nil
rescue URI::InvalidURIError
  nil
end

cgi = CGI.new

# CGI#[] yields the first submitted value for the field.
urls = cgi['url'].to_s.strip
playlist = http_uri(urls)

if playlist

   album = urls.split('/')[-2]

   # Fetch the list before touching the archive.
   listing = playlist.read

   # Open the archive once instead of once per song.
   Zip::ZipFile.open(album + '.zip', Zip::ZipFile::CREATE) do |zipit|
      listing.each_line do |line|
         url = line.strip
         song_uri = http_uri(url)

         # Blank or malformed lines carry nothing to download.
         next if song_uri.nil?

         song = url.split('/')[-1]

         # write, not puts: puts would append a newline to the binary data.
         zipit.get_output_stream(song) { |f| f.write song_uri.read }
      end
   end
else
   puts "Please check the input!"
end

Downloading files in python:

# Reads a playlist URL from the CGI field "url", downloads every URL listed
# in it into the current directory and packs the files into "<album>.zip",
# where <album> is the second-to-last segment of the playlist URL.
# Note: the cgi and cgitb modules were removed from the standard library in
# Python 3.13.
import cgi
import cgitb
import sys
import zipfile

try:
    # Python 3
    from urllib.request import urlopen, urlretrieve
except ImportError:
    # Python 2
    from urllib import urlopen, urlretrieve

cgitb.enable()

form = cgi.FieldStorage()
url = form.getvalue("url", "None")

if url != "None":
    # Only remote http(s) playlists are accepted; Python 2's urlopen would
    # otherwise happily open a local file path taken from the form.
    if not url.lower().startswith(("http://", "https://")):
        print("Please check the URL")
        sys.exit()

    album = url.split('/')[-2]

    try:
        urls = urlopen(url)
    except (IOError, ValueError):
        print("Please check the URL")
        # The original bare `exit` was never called, so execution fell
        # through with `urls` undefined.
        sys.exit()

    songs = []

    try:
        for raw_line in urls:
            # Lines arrive with their trailing newline (and as bytes on
            # Python 3); normalise before using them as URL and file name.
            line = raw_line.strip()
            if not isinstance(line, str):
                line = line.decode("utf-8")
            if not line:
                continue

            song = line.split("/")[-1]
            if not song:
                continue

            urlretrieve(line, song)
            songs.append(song)
    finally:
        urls.close()

    # Closing the archive writes its central directory; without it the zip
    # file is left incomplete.
    zipit = zipfile.ZipFile(album + '.zip', 'w')
    try:
        for song in songs:
            zipit.write(song)
    finally:
        zipit.close()

Downloading files in PHP:

<?php

// Appends a "Host: ..." entry to $header when $params['host'] is set and truthy.
// NOTE(review): neither $params nor $header is defined or read anywhere else in
// the visible script -- presumably a leftover from another script; confirm
// before relying on it or removing it.
if (isset($params['host']) && $params['host'])      $header[]="Host: " . $params['host'];

/**
 * Creates a compressed zip file.
 *
 * @param array  $files       Paths of the files to add; entries that are not
 *                            existing regular files are skipped.
 * @param string $destination Path of the archive to write.
 * @param bool   $overwrite   Whether an existing archive may be replaced.
 *
 * @return bool True when the archive was written, false when the destination
 *              exists and $overwrite is false, when no input file is usable,
 *              or when the archive could not be opened or closed.
 */
function create_zip($files = array(),$destination = '',$overwrite = true) {

    //avoid over write
    if(file_exists($destination) && !$overwrite) { return false; }

    $valid_files = array();

    if(is_array($files)) {
        // check each file; directories cannot be added with addFile()
        foreach($files as $file) {
            if(is_file($file)) {
                $valid_files[] = $file;
            }
        }
    }

    if(!count($valid_files)) {
        return false;
    }

    // OVERWRITE on its own fails when the destination does not exist yet,
    // so it is always combined with CREATE.
    $flags = $overwrite
        ? (ZipArchive::CREATE | ZipArchive::OVERWRITE)
        : ZipArchive::CREATE;

    //create the archive
    $zip = new ZipArchive();
    if($zip->open($destination, $flags) !== true) {
        return false;
    }

    //add the files
    foreach($valid_files as $file) {
        $zip->addFile($file,$file);
    }

    // The archive is only written on close(); a failed close must not be
    // reported as success just because an older file is still on disk.
    if(!$zip->close()) {
        return false;
    }

    return file_exists($destination);
}

/**
 * Fetches a URL with cURL and hands back whatever curl_exec() produced.
 *
 * Trailing whitespace is stripped from $url before the request is made, and
 * the connection attempt is limited to five seconds.
 *
 * @param string $url Address to fetch.
 *
 * @return mixed Result of curl_exec() with CURLOPT_RETURNTRANSFER enabled.
 */
function get_data($url)
{
    $handle = curl_init();

    curl_setopt_array($handle, array(
        CURLOPT_URL            => rtrim($url),
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => 5,
    ));

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}

/**
 * Checks whether $url looks like an http(s) URL: scheme, dot-separated host
 * labels, optional port and optional path.
 *
 * The dot between host labels is escaped; unescaped it matched any character,
 * so hosts containing spaces were accepted. The D modifier keeps "$" from
 * matching before a trailing newline.
 *
 * @param string $url Candidate URL.
 *
 * @return int|false Result of preg_match(): 1 on match, 0 otherwise, false on error.
 */
function isValidURL($url)
{
return preg_match('|^https?://[a-z0-9-]+(\.[a-z0-9-]+)*(:[0-9]+)?(/.*)?$|iD', $url);
}

// Form handler: resolves the submitted URL to a ".ram" playlist, downloads
// every entry of the playlist into the current directory and zips the files
// into "<album>.zip".
if(isset($_POST['submit']))
{
    // A missing "url" field is treated like an empty one.
    $url = isset($_POST['url']) ? trim($_POST['url']) : '';
    $songs = array();

    // parse_url() yields no string for a URL without a path (or a malformed one).
    $path   = parse_url($url, PHP_URL_PATH);
    $pieces = explode('/', is_string($path) ? $path : '');
    $count  = count($pieces);
    $album  = $count >= 2 ? $pieces[$count - 2] : '';

    // A ".php" album page is mapped to its playlist URL, and the album is
    // named after the page itself.
    if(isValidURL($url) and substr($url, -4) === ".php")
    {
        $url = str_replace(".php","/All.ram", $url);
        $url = str_replace("/home","",$url);
        $album = str_replace(".php","",$pieces[$count - 1]);
    }

    if(!isValidURL($url) or substr($url, -4) != ".ram"){
        echo "<p>Error! Please check the URL guru!</p>";
    }
    else{
        // Fetch the playlist once and reuse it.
        $returned_content = get_data($url);

        if($returned_content === false){
            echo "<p>Error! Please check the URL guru!</p>";
        }
        else{
            foreach(explode("\n", $returned_content) as $dload){
                $dload = trim($dload);

                // end() needs a real variable, not a function result.
                $parts = explode("/",$dload);
                $song  = end($parts);

                // Strict comparison so a file literally named "0" is kept.
                if($song === '')
                {
                    continue;
                }

                // Do not store failed downloads as empty files.
                $data = get_data($dload);
                if($data === false)
                {
                    continue;
                }

                if(file_put_contents($song,$data) !== false)
                {
                    array_push($songs,$song);
                }
            }
            $result = create_zip($songs,$album.'.zip');
        }
    }
}

?>

The average runtime ( 3 runs ) on a 3MB/s line for a file size of 13MB was :

Language Time in secs
PHP 11.49
Perl 9.0
Ruby 17.22
Python 0.24

All said and done, a simple wget -i took :
real 0m5.231s
user 0m0.012s
sys 0m0.131s

P.S : This was a very specific case and can't be treated as a generic benchmark; there are always better ways of tuning the code above!

Share this