;+
; WARNING: The interface to this routine is not yet solidified. Use the wrapper routine
; file_retrieve instead. This routine is still under development.
;
; NAME:
; file_http_copy
;
; PURPOSE:
; Downloads file(s) from http servers.
; Can also perform searches without downloading.
; Copies the file to a user-specified local directory.
; By default, files are downloaded only if the remote file is newer than
; the local file (based on mtime) or if the files differ in size.
; This routine is intended for use with simple HTTP file servers.
; Wildcard matching and recursive file searching can be used as well.
;
; CALLING SEQUENCE: There are two methods:
; Method 1:
; FILE_HTTP_COPY, pathnames, SERVERDIR=serverdir, LOCALDIR=localdir
; where:
; pathnames = (input string(s), scalar or array) Relative path name(s) of file(s) to download.
; serverdir = (scalar input string) Root name of source URL, must
; begin with: 'http://' and end with '/'
; localdir = (scalar input string) Root name of local directory, typically
; ends with '/'
; Note: The source is at: serverdir + pathnames
; The destination is: localdir + pathnames
; Method 2:
; FILE_HTTP_COPY, URL
; URL = full URL(s) of source file(s)
; Note: The remote directory structure is not retained with this method.
;
; Example:
; FILE_HTTP_COPY, 'ssl_general/misc/file_http_copy.pro', $
; SERVERDIR='http://themis.ssl.berkeley.edu/data/themis/socware/bleeding_edge/idl/', $
; localdir = 'myidl/'
;
; Note: Unix-style directory separators '/' should be used throughout. This convention also
; works on Windows.
;
; Alternate calling sequence:
; FILE_HTTP_COPY, URL
; where URL is an input (string) such as:
; URL = 'http://themis.ssl.berkeley.edu/data/themis/socware/bleeding_edge/idl/ssl_general/misc/file_http_copy.pro'
;
; INPUTS:
; URL - scalar or array string giving a fully qualified url
;
; OPTIONAL KEYWORDS:
; NO_CLOBBER: (0/1) Set this keyword to prevent overwriting local files.
; NO_UPDATE: (0/1) Set this keyword to prevent contacting the remote server to update existing files. Ignored for directory listings.
; IGNORE_FILESIZE: (0/1) Set this keyword to ignore file size when
; evaluating the need to download.
; NO_DOWNLOAD: (0/1/2) Set this keyword to prevent file downloads (url_info
; is still returned). Set to 2 to get names only.
; URL_INFO=url_info: (output) Named variable that returns information about the
; remote file, such as modification time and file size, as determined
; from the HTTP header. A zero is returned if the remote file is
; invalid.
; FILE_MODE= file_mode: If non-zero, sets the permissions for downloaded files.
; DIR_MODE = dir_mode: Sets permissions for newly created directories
; (Useful for shared directories)
; ASCII_MODE: (0/1) Set to 1 to force files to be downloaded as ascii text files (converts CR/LFs).
; Setting this keyword also forces the IGNORE_FILESIZE keyword to be set, because
; the local and remote files will typically differ in size.
; USER_PASS: string with format: 'user:password' for sites that require Basic authentication. Digest authentication is not supported.
; VERBOSE: (input; integer) Set the level of verbosity (uses "DPRINT"):
; 0: nearly silent; 2: typical messages; 4: debugging info
; PRESERVE_MTIME: Sets the local file's modification time to the server modification time. This keyword is ignored
; on Windows machines that don't have "touch" installed (no Cygwin or GNU utils).
; Note: The PRESERVE_MTIME option is experimental and highly platform
; dependent. Behavior may change in future releases, so use with
; caution.
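;
; Combined keyword usage sketch (values are illustrative):
; FILE_HTTP_COPY, 'ssl_general/misc/file_http_copy.pro', $
; SERVERDIR='http://themis.ssl.berkeley.edu/data/themis/socware/bleeding_edge/idl/', $
; LOCALDIR='myidl/', FILE_MODE='666'o, DIR_MODE='777'o, URL_INFO=info, VERBOSE=2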
;
;
; Examples:
; ;Download most recent version of this file to current directory:
; FILE_HTTP_COPY,'http://themis.ssl.berkeley.edu/data/themis/socware/bleeding_edge/idl/ssl_general/misc/file_http_copy.pro'
;
; OPTIONAL INPUT KEYWORD PARAMETERS:
; PATHNAME = pathname ; pathname is the filename to be created.
; If the directory does not exist then it will be created.
; If PATHNAME is not provided then the original filename is used
; and the file is placed in the current directory.
;
; RESTRICTIONS:
;
; PROXY: If you are behind a firewall and have to access the net through a
; Web proxy, set the environment variable 'http_proxy' to point to
; your proxy server and port, e.g.
; setenv, 'http_proxy=http://web-proxy.mpia-hd.mpg.de:3128'
; setenv, 'http_proxy=http://www-proxy1.external.lmco.com'
;
; The URL *MUST* begin with "http://".
;
; PROCEDURE:
; Open a socket to the webserver and download the header.
;
; EXPLANATION:
; FILE_HTTP_COPY can access http servers - even from behind a firewall -
; and perform simple downloads. Requires IDL V5.4 or later on Unix or
; Windows, and V5.6 on Macintosh.
;
; EXAMPLE:
; IDL> FILE_HTTP_COPY,'http://themis.ssl.berkeley.edu/themisdata/thg/l1/asi/whit/2006/thg_l1_asf_whit_2006010103_v01.cdf'
; IDL> PRINTDAT, file_info('thg_l1_asf_whit_2006010103_v01.cdf')
;
; MINIMUM IDL VERSION:
; V5.4 (uses SOCKET)
; MODIFICATION HISTORY:
; Original version: WEBGET()
; Written by M. Feldt, Heidelberg, Oct 2001 <mfeldt@mpia.de>
; Use /swap_if_little_endian keyword to SOCKET W. Landsman August 2002
; Less restrictive search on Content-Type W. Landsman April 2003
; Modified to work with FIRST image server- A. Barth, Nov 2006
; FILE_HTTP_COPY: New version created by D Larson: March 2007.
; Checks last modification time of remote file to determine need for download
; Checks size of remote file to determine need to download
; Very heavily modified from WEBGET():
; May/June 2007 - Modified to allow file globbing (wildcards).
; July 2007 - Modified to return remote file info (without download)
; July 2007 - Modified to allow recursive searches.
; August 2007 - Added file_mode keyword.
; April 2008 - Added dir_mode keyword
; Sep 2009 - Fixed user-agent
;
; $LastChangedBy: davin-mac $
; $LastChangedDate: 2014-12-06 11:08:02 -0800 (Sat, 06 Dec 2014) $
; $LastChangedRevision: 16363 $
; $URL: svn+ssh://thmsvn@ambrosia.ssl.berkeley.edu/repos/spdsoft/trunk/general/misc/file_http_copy.pro $
;-
; Function encode_url replaces
; HTML URL percent-encodings (sequences that start with a % sign)
; with regular characters
function encode_url, urln
; The list of possible URL encodings is very long;
; here we include only a few cases
; This list should be expanded if needed
; Full list can be found at http://en.wikipedia.org/wiki/Percent-encoding
str_replace, urln, "%24", "$"
str_replace, urln, "%26", "&"
str_replace, urln, "%27", "'"
str_replace, urln, "%3F", "?"
str_replace, urln, "%21", "!"
str_replace, urln, "%20", " "
return, urln
end
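; Usage sketch (illustrative; assumes the SSL/SPEDAS str_replace procedure is on the path):
;   IDL> u = 'some%20file%21.txt'
;   IDL> print, encode_url(u)
;   some file!.txt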
; Function compare_urls compares two URLs and returns 1 if they are the same, 0 otherwise
function compare_urls, url1, url2
result = 0
; _tmp copies prevent modifying the variables in the calling routine
url1_tmp = encode_url(url1)
url2_tmp = encode_url(url2)
url1_tmp = strtrim(strlowcase(url1_tmp),2)
url2_tmp = strtrim(strlowcase(url2_tmp),2)
result = strcmp(url1_tmp, url2_tmp)
return, result
end
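; Usage sketch (illustrative): the comparison is case-insensitive and decodes percent-escapes first:
;   IDL> print, compare_urls('HTTP://Host/My%20File.cdf', 'http://host/my file.cdf')
;   1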
;deprecated, see extract_html_links_regex pcruce 2013-04-09
pro extract_html_links,s,links, $ ; input: string ; output: links appended to array
relative=relative,$ ; Set to return only relative links
normal=normal ; Set to return only normal links (without ? or *)
;compile_opt idl2,hidden
p0 = strpos(strlowcase(s),'<a href="')
if p0 ge 0 then begin
p1 = strpos(s,'">',p0)
if p1 ge p0+9 then begin
link = strmid(s,p0+9,p1-p0-9)
bad = strlen(link) eq 0
if keyword_set(normal) then bad = (strpos(link,'?') ge 0) or bad
if keyword_set(normal) then bad = (strpos(link,'*') ge 0) or bad
if keyword_set(relative) then bad = (strpos(link,'/') eq 0) or bad ; remove absolute links (which start with '/')
if not bad then links = [links,link]
endif
endif
end
;Procedure: extract_html_links_regex
;Purpose: subroutine to parse <a> (link) tags from html.
;It is used exclusively by file_http_copy to parse .remote-index.html files.
;
;The _regex version of this routine replaces the original version because
;the old version made assumptions about the formatting of the .remote-index.html file
;that were dependent upon the type of web server producing the file. We think these
;bugs took so long to show up because Apache servers are extremely common.
;Modification prompted by the need for file_http_copy to work more reliably with rbspice & rb-emfisis.
;New version:
;#1 Handles html that doesn't place the href attribute exactly one space after the link tag
;#2 Handles cases where the server doesn't include newlines, or where multiple links are included per line of returned html by the server
;
;Inputs:
;s: An html string to be parsed.
;links: An empty string, or an array of strings from previous extract operation
;
;Outputs:
;links: extracted links are concatenated onto the links argument provided as input and returned through this argument
;
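;Usage sketch (hypothetical html fragment):
;  links = ''
;  extract_html_links_regex, '<a href="thg_l1_asf.cdf">f</a> <a href="2006/">d</a>', links, /normal
;  ;links -> ['thg_l1_asf.cdf', '2006/']
;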
pro extract_html_links_regex,s,links, $ ; input: string ; output: links appended to array
relative=relative,$ ; Set to strip out everything but the filename from a link
normal=normal,$ ; Set to return only links without '?' or '*' (probably never happens, but the option is retained just in case)
no_parent_links=no_parent_links ;Set to the parent domain to automatically exclude backlinks to the parent directory
compile_opt idl2,hidden
s_copy = s ;prevent modification of string
;this regex is a little tricky, most of the complexity is to prevent it from matching two links when it should match one
;e.g. It could match <a href="blah"></a><a href="blah"></a> instead of <a href="blah"> (matching between the first <a and the last >, rather than first & first)
if keyword_set(normal) then begin
;match a string containing the following in order
;#1 "<a "
;#2 zero or more characters that are not '<' or '>'
;#3 "href="
;#4 '"' (quotation mark)
;#5 0 or more characters that are not '"' '*' or '?'
;#6 '"' (quotation mark)
;#7 0 or more characters that are not '<' or '>'
;#8 The '>' character
;
;Other notes:
;#1 The () are not a part of the pattern. They indicate that anything matching inside the parentheses is a captured sub-expression
link_finder_regex='<a [^>^<]*href="([^"^*^?]*)"[^<^>]*>'
endif else begin
;match a string containing the following in order
;#1 "<a "
;#2 zero or more characters that are not '<' or '>'
;#3 "href="
;#4 '"' (quotation mark)
;#5 0 or more characters that are not '"'
;#6 '"' (quotation mark)
;#7 0 or more characters that are not '<' or '>'
;#8 The '>' character
;
;Other notes:
;#1 The () are not a part of the pattern. They indicate that anything matching inside the parentheses is a captured sub-expression
link_finder_regex='<a [^>^<]*href="([^"]*)"[^<^>]*>'
endelse
;/subexp indicates that everything inside the () of the regex should be returned in the results so that they can be extracted
pos = stregex(s_copy,link_finder_regex,/subexp,length=length,/fold_case)
while pos[1] ne -1 do begin
link = strmid(s_copy,pos[1],length[1]) ; extract the link text from the string
s_copy = strmid(s_copy,pos[0]+length[0]) ; remove link from string, so that we can process the next string
;exclude parent links, if keyword set and domain provided
if n_elements(no_parent_links) gt 0 then begin
if file_http_is_parent_dir(no_parent_links,link) then begin
link = ''
endif
endif
if keyword_set(relative) then begin
;match a string containing the following in order
;#1 a "/"
;#2 one or more characters that are not "/"
;#3 one or more "/" characters
;#4 the end of the string
rel_pos = stregex(link,'/[^/]+/?$',/fold_case)
if rel_pos[0] ne -1 then begin
link = strmid(link,rel_pos+1)
endif
endif
if strlen(link) gt 0 then begin
if strlen(links[0]) gt 0 then begin
links = [links,link]
endif else begin
links = [link]
endelse
endif
pos = stregex(s_copy,link_finder_regex,/subexp,length=length,/fold_case)
endwhile
end
;FUNCTION file_extract_html_links(filename,count)
;PURPOSE: returns links within an html file on disk.
;Used by file_http_copy to extract link tags from locally cached version of .remote-index.html files.
;INPUT: filename: (string) valid filename
;OUTPUT: count: number of links found
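;EXAMPLE (sketch; assumes '.remote-index.html' is a previously downloaded directory listing):
;  IDL> links = file_extract_html_links('.remote-index.html', count)
;  IDL> print, count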
function file_extract_html_links,filename,count,verbose=verbose,no_parent_links=no_parent_links ; Links with '*' or '?' or leading '/' are removed.
count=0 ; this should only return the relative links.
on_ioerror, badfile
openr,lun,filename,/get_lun
s=''
links = ''
while not eof(lun) do begin
readf,lun,s
; The REGEX version of this code typically takes about 2.5 times longer to run than the older code - is there a way to avoid having to use REGEX?
;extract_html_links,s,links,/relative,/normal ;deprecated, see extract_html_links_regex pcruce 2013-04-09
extract_html_links_regex,s,links,/relative,/normal,no_parent_links=no_parent_links
endwhile
free_lun,lun
bad = strlen(links) eq 0
w = where(bad eq 0,count)
; if count ne 0 then begin
; links = links[sort(links)]
; endif
dprint,verbose=verbose,dlevel=3,'Extracted '+strtrim(count,2)+' links from: '+filename
return,count gt 0 ? links[w] : ''
badfile:
dprint,dlevel=1,verbose=verbose,'Bad file: '+filename
return,''
end
; Function: file_http_strip_domain
; Purpose: removes the domain (e.g. http://domain.whatever/) from an html link, if present; otherwise returns the string unmodified
; Inputs:
; s: The string to have domain removed
; Returns:
; s: with domain removed
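; Usage sketch:
;   IDL> print, file_http_strip_domain('http://themis.ssl.berkeley.edu/data/themis/')
;   data/themis/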
function file_http_strip_domain,s
compile_opt idl2,hidden
;match a string containing the following in order
;#1 the beginning of the string
;#2 "http://"
;#3 one or more characters that are not "/"
;#4 a single "/"
m = stregex(s,"^http://[^/]+/",length=l,/fold_case)
if m[0] ne -1 then begin
return, strmid(s,l)
endif else begin
return, s
endelse
end
;Function: file_http_is_parent_dir
;Purpose: predicate function, checks whether the provided link is a parent to the current directory
;Inputs:
; Current: Set to the full url for the current directory
; Link: The link to be checked
;Returns:
; 1: if link is to current's parent
; 0: if link is not to current's parent
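; Usage sketch (illustrative; 'host.example' is a placeholder): '/data/' links back to the parent of the current directory:
;   IDL> print, file_http_is_parent_dir('http://host.example/data/2006/', '/data/')
;   1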
function file_http_is_parent_dir,current,link
compile_opt idl2,hidden
if n_elements(link) eq 0 then return,0
if strlen(link) eq 0 then return,0
;match a string containing the following in order
;#1 the contents of the variable "link"
;#2 one or more characters that are not "/"
;#3 the "/" character
;#4 the end of the string
;Other notes:
;#1 link will always end in "/" if it is a directory. So there is no need to specify it in the regex
;#2 strip domain will always return a string that does not begin with a "/" (relative link), so we add it back in
return,stregex("/"+file_http_strip_domain(current),escape_string(link)+"[^/]+/$",/boolean,/fold_case)
end
function file_http_header_element,header,name
res = strcmp(header,name,strlen(name),/fold_case)
g = where(res, Ng)
if Ng GT 0 then return,strmid(header[g[0]],strlen(name)+1)
return,''
end
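; Usage sketch: returns the value that follows a named header field (case-insensitive), e.g.:
;   IDL> header = ['HTTP/1.0 200 OK', 'Content-Length: 1024']
;   IDL> print, file_http_header_element(header, 'Content-Length:')
;   1024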
pro file_http_header_info, Header, hi, verbose=verbose
;;
;; MIME type recognition
;
; hi.url = url
if strmid(hi.url,0,1,/reverse_offset) eq '/' then hi.directory=1
hi.ltime = systime(1)
if not keyword_set(header) then return ;,hi
hi.status_str = header[0]
header0 = strsplit(/extract,header[0],' ')
hi.status_code = fix( header0[1] )
; get server time (date)
date = file_http_header_element(header,'Date:')
if keyword_set(date) then hi.atime = str2time(date, informat = 'DMYhms') else hi.atime = hi.ltime
hi.clock_offset = hi.atime - hi.ltime
dprint,dlevel=6,verbose=verbose,'date=',date
; Look for successful return
pos = strpos(strupcase(header[0]),'200 OK')
hi.exists = hi.status_code eq 200 || hi.status_code eq 304
; if hi.exists eq 0 then return
hi.class = 'text'
hi.type = 'simple' ; in case no information found...
hi.Content_Type = file_http_header_element(header,'Content-Type:')
if keyword_set(hi.Content_Type) then begin
hi.Class = (strsplit(hi.Content_Type, '/', /extract))[0]
hi.Type = (strsplit(hi.Content_Type, '/', /extract))[1]
ENDIF
; get file modification time
last_modified = file_http_header_element(header,'Last-Modified:')
hi.mtime = keyword_set(last_modified) ? str2time(last_modified, informat = 'DMYhms') : systime(1)
dprint,dlevel=4,verbose=verbose,'last_modified=',last_modified
; Try to determine length
len = file_http_header_element(header,'Content-Length:')
if keyword_set(len) then hi.size = long64(len) else hi.size = -1
return ;,hi
END
;subsequent block subsumed in more general solution, extract_html_links_regex (pcruce 2013-04-09)
; Function strip_sub_pathnames_from_link, link, sub_pathname, end_pathname
; ;Kludge for links that come up with 'http' at the start, note that we
; ;do this here and not in extract_html_links, because we are not sure
; ;how the change will interact with the rest of the program. jmm,
; ;11-oct-2012.
; ;The appropriate link will be that which exists between sub_pathname
; ;and end_pathname. Note that nothing happens if the link does not
; ;begin with 'http'.
; if(strmid(link, 0, 4) Ne 'http') then return, link
; link_out = link
; l_sub_pathname = strlen(sub_pathname) ;find the position of sub_pathname and end_pathname in the link
; l_end_pathname = strlen(end_pathname)
; if(l_sub_pathname Eq 0) then begin ;if there is no sub_pathname, then use file_basename
; link_out = file_basename(link)
; endif else begin
; pos_sub_pathname = strpos(link, sub_pathname)
; if(pos_sub_pathname Ne -1) then begin
; link_out = strmid(link, pos_sub_pathname+l_sub_pathname)
; endif else link_out = file_basename(link)
; endelse
; if(strlen(link_out) gt 0) then begin ;un-append end_pathname, if needed
; if(l_end_pathname gt 0) then begin
; pos_end_pathname = strpos(link_out, end_pathname)
; if(pos_end_pathname Ne -1) then begin
; if(pos_end_pathname eq 0) then link_out = '' $
; else link_out = strmid(link_out, pos_end_pathname-1)
; endif
; endif
; endif
;
; Return, link_out
; End
PRO file_http_copy, pathnames, newpathnames, $
recurse_limit=recurse_limit, $
verbose=verbose, $ ; input: (integer) Set level of verbose output. (2 is typical)
serverdir=serverdir, $ ; input: (string) URL of source files: ie: 'http://themis.ssl.berkeley.edu/data/themis/' ;trailing '/' is required
localdir=localdir, $ ; input: (string) destination directory i.e.: 'e:/data/themis/' ;trailing '/' is required
localnames=localname, $ ; output: Downloaded filenames are returned in this variable
file_mode=file_mode, $ ; input: if non-zero, file permissions are set to this value. (use '666'o for shared files.)
dir_mode=dir_mode, $ ; input: defines directory permissions for newly created directories (use '777'o for shared directories)
last_version=last_version, $
min_age_limit=min_age_limit, $
host=host, $ ;input string: Used to define HOST in HTTP header
user_agent=user_agent, $ ; input string: Used to define user_agent in HTTP message.
user_pass=user_pass, $
preserve_mtime=preserve_mtime,$ ; EXPERIMENTAL, highly platform dependent
restore_mtime=restore_mtime, $ ; EXPERIMENTAL, highly platform dependent
if_modified_since = if_modified_since, $
ascii_mode = ascii_mode, $ ; input (0/1) Set this keyword to force files to be downloaded as ascii text. (converts CR/LFs)
no_globbing=no_globbing, $
no_clobber=no_clobber, $ ; input: (0/1) set keyword to prevent overwriting existing files. (url_info is still returned however)
archive_ext=archive_ext, $ ; input: set to ".ARC??" to rename older files instead of deleting them.
archive_dir=archive_dir, $ ; input: set to "archive/" to move older files to sub directory archive/.
no_update=no_update, $ ; input: (0/1) set keyword to prevent contacting remote server if file already exists. Ignored if globbing is used.
no_download=no_download, $ ; input: (0/1/2) set keyword to prevent downloading files, Useful to obtain url_info only. Set to 2 to get names only.
ignore_filesize=ignore_filesize, $ ; input: (0/1) if set then file size is ignored when evaluating need to download.
ignore_filedate=ignore_filedate, $ ; NOT YET OPERATIONAL! input: (0/1) if set then file date is ignored when evaluating need to download.
url_info=url_info_s, $ ; output: structure containing URL info obtained from the HTTP Header.
progobj=progobj, $ ; This keyword is experimental - please don't count on it
links=links2, $ ; Output: links are returned in this variable if the file is an html file
force_download=force_download, $ ;Allows download to be forced no matter modification time. Useful when moving between different repositories(e.g. QA and production data)
error = error
;;
;;
;; sockets supported in unix & windows since V5.4, Macintosh since V5.6
tstart = systime(1)
dprint,dlevel=5,verbose=verbose,'Start; $Id: file_http_copy.pro 16363 2014-12-06 19:08:02Z davin-mac $'
request_url_info = arg_present(url_info_s)
url_info_s = 0
;dprint,dlevel=3,verbose=verbose,no_url_info,/phelp
if not keyword_set(localdir) then localdir = ''
if not keyword_set(serverdir) then serverdir = ''
for pni=0L,n_elements(pathnames)-1 do begin
localname=''
links2 = ''
url_info = {http_info, $
url:'', $ ; Full url of file
io_error: 0, $
localname:'', $ ; local file name
status_str:'', $
status_code:0, $
content_type:'', $
type:'', $ ; type of file
class:'', $ ; Class
exists: 0b, $
directory: 0b, $
ltime: 0ll, $ ; Time when procedure was run
atime: 0ll, $ ; server time at time of last access
mtime: 0ll, $ ; last mod time of file
clock_offset: 0ll, $ ; difference between server time and local time
size: 0ll $
}
;url_info = 0
if keyword_set(serverdir) then begin
pathname = pathnames[pni]
url = serverdir+pathname
endif else begin
url = pathnames[pni]
pathname = file_basename(url)
if strmid(url,0,1,/reverse_offset) eq '/' then pathname += '/' ;add the '/' back on for directories
endelse
cgi_bin = strpos(url,'cgi-bin') gt 0
if keyword_set(newpathnames) then begin
no_globbing=1
newpathname = newpathnames[pni]
endif else newpathname = pathname
if cgi_bin && n_elements(no_globbing) eq 0 then no_globbing=1
url_info.url = url
url_info.ltime = systime(1)
dprint,dlevel=6,verbose=verbose,/phelp,serverdir
dprint,dlevel=6,verbose=verbose,/phelp,localdir
dprint,dlevel=6,verbose=verbose,/phelp,pathname
dprint,dlevel=6,verbose=verbose,/phelp,newpathname
dprint,dlevel=6,verbose=verbose,/phelp,url
indexfilename = '.remote-index.html'
globpos = min( uint( [strpos(pathname,'*'),strpos(pathname,'?'),strpos(pathname,'['),strpos(pathname,']')] ) )
;if using globbing, then read the server remote index file and extract the links
if (~ keyword_set(no_globbing)) && globpos le 1000 then begin ; Look for globbed ([*?]) filenames
dprint,dlevel=4,verbose=verbose,'Warning! Using Globbing!'
slash='/'
slashpos1 = strpos(pathname,slash,globpos,/reverse_search)
sub_pathname = strmid(pathname,0,slashpos1+1)
dprint,dlevel=5,verbose=verbose,/phelp,sub_pathname
; First get directory listing and extract links: (listing will not be archived)
file_http_copy,sub_pathname,serverdir=serverdir,localdir=localdir,url_info=index, host=host ,ascii_mode=1 $
,min_age_limit=min_age_limit,verbose=verbose,file_mode=file_mode,dir_mode=dir_mode,if_modified_since=if_modified_since $
, links=links, user_agent=user_agent ,user_pass=user_pass,error=error ;, no_update=no_update ;,preserve_mtime=preserve_mtime, restore_mtime=restore_mtime
if keyword_set(error) then begin
dprint,dlevel=1,verbose=verbose,'Error detected ',error
goto, final_quit
endif
dprint,dlevel=5,verbose=verbose,/phelp,links
;strip out return directory links
; if n_elements(links) gt 1 then begin
; links = [1:n_elements(links)-1]
; endif
slashpos2 = strpos(pathname,slash,globpos)
if slashpos2 eq -1 then slashpos2 = strlen(pathname) ; special case for non-directories (files)
sup_pathname = strmid(pathname,0,slashpos2+1)
end_pathname = strmid(pathname,slashpos2+1)
;subsequent block subsumed in more general solution, extract_html_links_regex (pcruce 2013-04-09)
;If links start with 'http', strip out the unexpected parts
; if(n_elements(links) gt 0) then begin
; for j = 0, n_elements(links)-1 do $
; links[j] = strip_sub_pathnames_from_link(links[j], sub_pathname, end_pathname)
; endif
w = where(strmatch(sub_pathname+links,sup_pathname),nlinks)
if nlinks gt 0 then begin
w = w[sort(links[w])] ; sort in alphabetical order (needed for last_version keyword)
dprint,dlevel=5,verbose=verbose,links[w],/phelp
rec_pathnames = sub_pathname + links[w] + end_pathname
dprint,dlevel=5,verbose=verbose,/phelp,sup_pathname
dprint,dlevel=5,verbose=verbose,/phelp,end_pathname
dprint,dlevel=5,verbose=verbose,/phelp,rec_pathnames
if keyword_set(last_version) then i0 = nlinks-1 else i0=0L
for i=i0,nlinks-1 do begin
dprint,dlevel=3,verbose=verbose,'Retrieve link#'+strtrim(i+1,2)+' of '+strtrim(nlinks,2)+': '+ rec_pathnames[i]
; Recursively get files:
file_http_copy,rec_pathnames[i],serverdir=serverdir,localdir=localdir, host=host $
, verbose=verbose,file_mode=file_mode,dir_mode=dir_mode, ascii_mode=ascii_mode $
, min_age_limit=min_age_limit, last_version=last_version, url_info=ui $
, no_download=no_download, no_clobber=no_clobber, no_update=no_update, archive_ext=archive_ext, archive_dir=archive_dir $
, force_download=force_download $
, ignore_filesize=ignore_filesize,user_agent=user_agent,user_pass=user_pass,if_modified_since=if_modified_since $
, preserve_mtime=preserve_mtime, restore_mtime=restore_mtime
; dprint,dlevel=5,verbose=verbose,/phelp,lns
if not keyword_set(ui) then message,'URL info error'
w = where(ui.exists ne 0,nw)
if nw ne 0 then uis = keyword_set(uis) ? [uis,ui[w]] : ui[w] ; only include existing files
dprint,dlevel=5,verbose=verbose,/phelp,localname
endfor
if keyword_set(uis) then url_info = uis
endif else begin
dprint,dlevel=3,verbose=verbose,'No files found matching: '+sup_pathname
endelse
goto, final
endif ; End of globbed filenames
; Begin normal file downloads
localname = localdir + newpathname
if strmid(url,0,1,/reverse_offset) eq '/' then begin ; Directories
url_info.directory = 1
localname = localname + indexfilename
endif
lcl = file_info(localname)
if keyword_set(no_download) && no_download eq 2 then begin
dprint,dlevel=4,verbose=verbose,'Warning: URL_INFO is not valid for: "'+url+'"'
url_info.localname = localname
url_info.exists = -1 ; remote existence is not known!
goto, final
endif
if keyword_set(no_update) && lcl.exists then begin
dprint,dlevel=3,verbose=verbose,'Warning: Updates to existing file: "'+lcl.name+'" are not being checked!'
url_info.localname = localname
url_info.exists = -1 ; remote file existence is not known!
if arg_present(links2) then begin
links2 = file_extract_html_links(localname,verbose=verbose,no_parent=url) ; Does this belong here? this might be producing unneeded work
endif
goto, final
endif
if lcl.exists eq 1 and lcl.write eq 0 then begin
dprint,dlevel=2,verbose=verbose,'Local file: '+lcl.name+ ' exists and is write protected. Skipping.'
url_info.localname = localname
url_info.exists = -1 ; existence is not known!
if arg_present(links2) then begin
links2 = file_extract_html_links(localname,verbose=verbose,no_parent=url) ; Does this belong here?
endif
goto, final
endif
;Warning: The file times (mtime,ctime,atime) can be incorrect (with Windows) if the user has (recently) changed the time zone the computer resides in.
;This can lead to unexpected results.
file_age = tstart-lcl.mtime
if file_age lt (keyword_set(min_age_limit) ? min_age_limit : 0) then begin
dprint,dlevel=3,verbose=verbose,'Found recent file ('+strtrim(long(file_age),2)+' secs): "'+localname+'" (assumed valid)'
;url_info.ltime = systime(1)
url_info.localname = localname
url_info.exists = 1
if arg_present(links2) then begin
links2 = file_extract_html_links(localname,verbose=verbose,no_parent=url)
endif
goto, final
endif
;;
;; open the connection and request the file
;;
read_timeout = 30
connect_timeout = 10
t_connect = systime(1)
if n_elements(user_agent) eq 0 then user_agent = 'FILE_HTTP_COPY IDL'+!version.release + ' ' + !VERSION.OS + '/' + !VERSION.ARCH
stack = scope_traceback(/structure)
referrer = stack.routine + string(stack.line,format='("(",i0,")")')
referrer = strjoin(referrer,':')
; filemodtime = lcl.mtime
Proxy = getenv('http_proxy')
IF Proxy NE '' THEN BEGIN ; sort out proxy name
dprint,dlevel=1,verbose=verbose,'Using Proxy: ',proxy
Server = strmid(proxy, 7 )
Purl = url
ENDIF ELSE BEGIN ; Without proxy
slash1 = StrPos(strmid(url, 7, StrLen(url)), '/')
Server = StrMid(url, 7, slash1 )
purl = strmid(url,slash1+7, StrLen(url))
ENDELSE
lastcolon = strpos(server,':', /reverse_search)
if lastcolon gt 1 then begin
port = fix(strmid(server,lastcolon+1) )
server = strmid(server,0,lastcolon)
endif else port = 80
dprint,dlevel=4,verbose=verbose,'Opening server: "',server, '" on Port: ',port
if not keyword_set(server) then dprint,dlevel=0,verbose=verbose,'Bad server: "'+server+'"'
dprint,dlevel=5,verbose=verbose,'If IDL hangs soon after printing this statement then it could be a problem with VPN on some versions of MacOS'
socket, unit, Server, Port, /get_lun,/swap_if_little_endian,error=error,$
read_timeout=read_timeout,connect_timeout=connect_timeout
if error ne 0 then begin
If(n_elements(unit) Gt 0) Then free_lun, unit ;jmm, 19-jun-2007 for cases where unit is undefined
dprint,dlevel=0,verbose=verbose,!error_state.msg
if error eq -292 then dprint,dlevel=1,verbose=verbose,"It appears that the server "+server+" is down."
if error eq -291 then dprint,dlevel=1,verbose=verbose,"Do you need to set a proxy server? (i.e. setenv,'http_proxy=www.proxy-example.com')"
if error eq -290 then dprint,dlevel=1,verbose=verbose,"Are you connected to the internet?"
dprint,dlevel=2,verbose=verbose,'error code:',error,!error_state.code,' ',!error_state.sys_msg
goto, final_quit
endif
dprint,dlevel=4,verbose=verbose,'Purl= ',purl
printf, unit, 'GET '+purl + ' HTTP/1.0'
; aaflores july-2012 Allow HOST keyword to overwrite default value
if ~keyword_set(host) then host = server
; lphilpott may-2012 Add Host header to fix problem with site that have a permanent redirect
printf, unit, 'Host: ' + host
if keyword_set(user_agent) then begin
printf, unit, 'User-Agent: ',user_agent
dprint,dlevel=4,verbose=verbose,'User Agent: ',user_agent
endif
if size(/type,referrer) eq 7 then begin
printf, unit, 'Referer: ',referrer
dprint,dlevel=4,verbose=verbose,'Referer: ',referrer
endif
if keyword_set(user_pass) then begin
printf, unit, 'Authorization: Basic ',strpos(user_pass,':') ge 0 ? idl_base64(byte(user_pass)) : user_pass
dprint,dlevel=4,verbose=verbose,'USER_PASS: ',user_pass
endif
if keyword_set(if_modified_since) then begin
filemodtime = if_modified_since lt 2 ? lcl.mtime : if_modified_since
printf, unit, 'If-Modified-Since: ',time_string(filemodtime,tformat='DOW, DD MTH YYYY hh:mm:ss GMT')
dprint,dlevel=4,verbose=verbose,'If-Modified-Since: ',time_string(filemodtime,tformat='DOW, DD MTH YYYY hh:mm:ss GMT')
endif
printf, unit, ''
LinesRead = 0
text = 'XXX'
;;
;; now read the header
;;
On_IOERROR, done
Header = strarr(256)
WHILE text NE '' do begin
readf, unit, text
Header[LinesRead] = text
LinesRead = LinesRead+1
IF LinesRead MOD 256 EQ 0 THEN $
Header=[Header, StrArr(256)]
ENDWHILE
DONE: On_IOERROR, NULL
;;
if LinesRead EQ 0 then begin
free_lun, unit
url_info.io_error = 1
dprint,dlevel=0,verbose=verbose,!error_state.msg
goto, final
endif
Header = Header[0:LinesRead-1]
url_info.localname = localname
file_http_header_info,header,url_info,verbose=verbose
dprint,dlevel=4,verbose=verbose,'Server ',server,' Connect time= ',systime(1)-t_connect
dprint,dlevel=6,verbose=verbose,'Header= ',transpose(header)
dprint,dlevel=6,verbose=verbose,phelp=2,url_info
if url_info.status_code eq 401 then begin
realm = file_http_header_element(header,'WWW-Authenticate:')
prefix = keyword_set(user_pass) ? 'Invalid USER_PASS: "'+user_pass+'" for: '+realm : 'keyword USER_PASS required for: '+realm
dprint,dlevel=1,prefix+' Authentication Error: "'+url+'"'
goto , close_server
endif
; lphilpott may-2012 call redirect code for permanent redirects (301) in addition to temporary redirects
if url_info.status_code eq 302 || url_info.status_code eq 301 then begin ; Redirection
location = file_http_header_element(header,'Location:')
dprint,dlevel=1,verbose=verbose,'Redirection to: ',location
if keyword_set(location) then begin
if compare_urls(location, url_info.url) then begin ; if it redirects to self then exit
dprint,dlevel=1,verbose=verbose,'Error! Redirects to self: ',location
goto, close_server
endif else begin ; WARNING THIS SECTION OF CODE MIGHT BE INCOMPLETE BECAUSE RECURSIVE CALL IS MISSING MANY KEYWORDS !!!!
dprint,'Warning: Redirection may not work properly because not all keywords are set!'
; 2014-06-10 JWL
; Removed 'host' keyword from recursive call when resolving HTTP
; 301/302 redirections. It was erroneously sending the original
; 'host' parameter to the target of the redirection.
file_http_copy,location,keyword_set(newpathnames) ? newpathname : '', $
localdir=file_dirname(localdir+pathname)+'/',verbose=verbose, links=links2,$; lphilpott may-2012 change localdir so that the final directory the file is saved to is the one intended
;localdir=localdir,verbose=verbose, $
url_info=url_info,file_mode=file_mode,dir_mode=dir_mode, ascii_mode=ascii_mode, $
archive_ext=archive_ext, archive_dir=archive_dir, $
user_agent=user_agent,user_pass=user_pass, if_modified_since=if_modified_since ;,preserve_mtime=preserve_mtime,restore_mtime=restore_mtime
goto, close_server
endelse
endif
endif
if abs(url_info.clock_offset) gt 30 then $
dprint,dlevel=1,verbose=verbose,'Warning! Remote and local clocks differ by:',url_info.clock_offset,' Seconds'
if url_info.status_code eq 304 then begin ; Not modified since
dprint,dlevel=2,verbose=verbose,'Local file: ',localname,' is current'
goto, close_server
endif
if url_info.exists then begin
; Determine if download is needed
tdiff = (url_info.mtime - lcl.mtime) ; seconds old
MB = 2.^20
if lcl.exists then begin
download_file = 0
dprint,verbose=verbose,dlevel=4,'tdiff=',tdiff,' sec'
if tdiff gt 0 then begin
if keyword_set(no_clobber) then dprint,dlevel=1,verbose=verbose, format="('Warning! ',f0.1,' day old local file: ',a )", tdiff/24d/3600d, localname
download_file = 1
endif
if tdiff lt 0 and keyword_set(restore_mtime) then begin
; file_touch,exists=texists
dprint,dlevel=3,verbose=verbose,'File modification time mismatch. Restoring modification time. ',lcl.name
; if ~texists then begin
; dprint,verbose=verbose,dlevel=3 ,'Executable "touch" not found. Could not preserve_mtime'
; endif else $
if keyword_set(preserve_mtime) and lcl.size eq url_info.size then begin
file_touch,lcl.name,url_info.mtime,/mtime,/no_create,verbose=verbose ; ,toffset=time_zone_offset()
endif
endif
if (lcl.size ne url_info.size) && (~ keyword_set(ascii_mode)) then begin
if keyword_set(no_clobber) then $
dprint,dlevel=1,verbose=verbose,url_info.size/mb,lcl.size/mb, file_basename(localname), format='("Warning! Different file sizes: Remote=",f0.3," MB, Local=",f0.3," MB file: ",a)'
if not keyword_set(ignore_filesize) then download_file = 1
endif
if keyword_set(no_clobber) then download_file=0
endif else begin ; end of lcl.exists
download_file = 1
dprint,dlevel=3,verbose=verbose,format="('Found remote (',f0.3,' MB) file: ""',a,'""')",url_info.size/mb,url
endelse
if keyword_set(no_download) then download_file = 0
if keyword_set(force_download) then download_file = 1
if download_file then begin ; begin file download
dirname = file_dirname(localname)
file_mkdir2,dirname,mode = dir_mode,dlevel=2,verbose=verbose
On_IOERROR, file_error2
if file_test(localname,/regular,/write) then begin
if keyword_set(archive_ext) || keyword_set(archive_dir) then begin
file_archive,localname,archive_ext=archive_ext,archive_dir=archive_dir,verbose=verbose,dlevel=2
endif else begin
dprint,'Deleting old file: '+localname,dlevel=2,verbose=verbose
file_delete,localname,/allow_nonexistent
endelse
endif
openw, wunit, localname, /get_lun
ts = systime(1)
t0 = ts
if keyword_set(ascii_mode) || url_info.size lt 0 || strmid(url_info.type,0,4) eq 'html' then begin ; download text file (typically these are directory listings)
dprint,dlevel=3,verbose=verbose,'Downloading "'+localname+'" as a text file.'
lines=0ul
while eof(unit) EQ 0 do begin
readf, unit, text
printf, wunit, text
if arg_present(links2) then extract_html_links_regex,text,links2 ,/relative, /normal,no_parent=url
dprint,dwait=10,dlevel=1,verbose=verbose,'Downloading "',localname,'" Please wait ', lines++
endwhile
dprint,dlevel=2,verbose=verbose,'Downloaded '+strtrim(lines,2)+' lines in '+string(systime(1)-ts,format='(f0.2)')+' seconds. File:'+localname
; if n_elements(links2) gt 1 then links2 = links2[1:*] ; get rid of first ''
endif else begin ; download Non-text (binary) files
maxb = 2l^20 ; 1 Megabyte default buffer size
nb=0l
b=0l
while nb lt url_info.size do begin
buffsize = maxb < (url_info.size-nb)
aaa = bytarr(buffsize,/nozero)
readu, unit, aaa
writeu, wunit, aaa
nb += buffsize
t1 = systime(1)
dt = t1-t0
b += buffsize
percent = 100.*float(nb)/url_info.size
if (dt gt 10.) and (nb lt url_info.size) then begin ; Wait 10 seconds between updates.
rate = b/mb/dt ; This will only display if the filesize (url_info.size) is greater than MAXB
eta = (url_info.size-nb)/mb/rate +t1 - tstart
messstr = string(format='(" ",f5.1," % (",f0.1,"/",f0.1," secs) @ ",f0.2," MB/s File: ",a)', percent, t1-tstart,eta, rate,file_basename(localname) ,/print)
t0 = t1
b =0l
dprint,dlevel=2,verbose=verbose,messstr & wait,.01
if obj_valid(progobj) then begin
progobj->update,percent,text=messstr
if progobj->checkcancel() then message,'Download cancelled by user',/ioerror
endif
endif
endwhile
t1 = systime(1)
dt = t1 - tstart
messstr = string(/print,format = "('Downloaded ',f0.3,' MBytes in ',f0.1,' secs @ ',f0.2,' MB/s File: ""', a,'""' )",nb/mb,dt,nb/mb/dt,localname )
dprint,dlevel=2,verbose=verbose,messstr
if obj_valid(progobj) then begin
progobj->update,percent,text=messstr
endif
endelse
free_lun, wunit
if keyword_set(file_mode) then file_chmod,localname,file_mode
if keyword_set(preserve_mtime) then begin
; file_touch,exists=texists
; if texists then begin
;file touch works in local time, but mtime is unix time
file_touch,localname,url_info.mtime,/mtime,/no_create,verbose=verbose ;,toffset=time_zone_offset()
; endif else begin
; dprint,verbose=verbose,dlevel=3 ,'Executable "touch" not found. Could not preserve_mtime'
; endelse
endif
if 0 then begin
file_error2:
dprint,dlevel=0,verbose=verbose,'Error downloading file: "'+url+'"'
error = !error_state.msg
dprint,dlevel=0,verbose=verbose,error
if obj_valid(progobj) then begin
progobj->update,0.,text=error
endif
if keyword_set(wunit) then begin
free_lun, wunit
file_move,localname,localname+'.error'
endif
; dprint,dlevel=0,verbose=verbose,'Deleting: "' + lcl.name +'"'
; file_delete,lcl.name ; This is not desirable!!!
endif
endif else begin
dprint,dlevel=3,verbose=verbose,'Local file: "' + localname + '" is current (Not downloaded)'
endelse
endif else begin
dprint,dlevel=1,verbose=verbose,'Remote file not found! "'+ url + '" (increase VERBOSE to learn more)'
dprint,dlevel=3,verbose=verbose,'If file was expected, you should verify that your anti-virus software did not block the connection and add an exception for IDL, if necessary'
dprint,dlevel=4,verbose=verbose,'Request Had Header: '
dprint,dlevel=4,verbose=verbose, transpose(Header)
; url_info = 0
endelse
close_server:
free_lun, unit
dprint,dlevel=5,verbose=verbose,'Closing server: ',server
final:
if keyword_set(recurse_limit) then begin ; Recursive search for files.
if keyword_set(index) then if index.localname ne localdir+indexfilename then links=''
if not keyword_set(links) then begin ; Get directory list
file_http_copy,'',serverdir=serverdir,localdir=localdir, $
min_age_limit=min_age_limit,verbose=verbose,no_update=no_update, $
file_mode=file_mode,dir_mode=dir_mode,ascii_mode=1 , host=host, $
url_info=index,links=links,user_agent=user_agent,user_pass=user_pass,if_modified_since=if_modified_since ;No need to preserve mtime on dir listings ,preserve_mtime=preserve_mtime
endif
wdir = where(strpos(links,'/',0) gt 0,ndirs) ; Look in each directory for the requested file
for i=0,ndirs-1 do begin
dir = links[wdir[i]]
file_http_copy,pathname,recurse_limit=recurse_limit-1,serverdir=serverdir+dir,localdir=localdir+dir $
, verbose=verbose,file_mode=file_mode,dir_mode=dir_mode,ascii_mode=ascii_mode $
, min_age_limit=min_age_limit, last_version=last_version, url_info=ui $
, no_download=no_download, no_clobber=no_clobber,no_update=no_update, archive_ext=archive_ext,archive_dir=archive_dir $
, ignore_filesize=ignore_filesize,user_agent=user_agent,user_pass=user_pass, host=host $
, preserve_mtime=preserve_mtime,restore_mtime=restore_mtime,if_modified_since=if_modified_since
if not keyword_set(ui) then message,'URL error (this error should never occur)'
w = where(ui.exists ne 0,nw)
if nw ne 0 then url_info = keyword_set(url_info) ? [url_info,ui[w]] : ui[w] ; only include existing files
endfor
endif
; if keyword_set(url_info) then localname=url_info.localname else localname=''
final2:
if keyword_set(url_info_s) and keyword_set(url_info) then $
url_info_s=[url_info_s,url_info] else url_info_s=url_info
endfor
if keyword_set(url_info_s) then localname=url_info_s.localname else localname=''
dprint,dlevel=5,verbose=verbose,'Done'
;if n_elements(verbose) ne 0 then dprint,setdebug=last_dbg ; Reset previous debug level.
return
final_quit:
if keyword_set(url_info_s) and keyword_set(url_info) then $
url_info_s=[url_info_s,url_info] else url_info_s=url_info
dprint,dlevel=1,verbose=verbose,'Abnormal exit. Aborting.'
END