;ndups+i
; Search for duplicate files among the names listed in ./lists/proj.allfixed.
;
; Main-level program (run with .run). Steps:
;  1. read the file list from lists/proj.allfixed
;  2. take the basenames and sort by basename
;  3. count how many consecutive sorted entries share each basename
;  4. keep only the sets with more than one entry
;  5. for the first two files of each set, require identical file size
;     - sets whose sizes differ are dropped (just a common name)
;  6. for each surviving set compare with cmp:
;     - the first 30000 bytes (fast reject for large files)
;     - the last 30000 bytes, when the file is larger than 60000 bytes
;     - files passing both are assumed equal (a full compare of a 2gb
;       file takes about 60 secs)
;  7. if the set has a third file, compare it against the first file in
;     the same way (at most maxdups files are checked per set)
;  8. save the confirmed duplicate sets in dupi.sav
;
; Outputs (idl save file dupi.sav):
;  lastrun : string  systime() when the run finished
;  dupiar  : array of dupI structures, one per confirmed duplicate set
;  fnmars  : string array of full names (basename sorted, duplicate sets only);
;            dupiar.ind indexes into it
;
; goto,doit
maxdups=3      ; only check up to 3 dups
;
; struct holding dup info
;
dupI={bname:"",$           ; basename shared by the set
      nduporig:0,$         ; how many files with this basename were in the list
      ndupchk :0,$         ; number confirmed equal (only checks max of 3)
      ind     :0l,$        ; index into fnmars for first file of set. others are adjacent
      fsize   :0ull,$      ; bytes. 64 bit so sizes above 4gb are not truncated
      flist   :strarr(maxdups)} ; full path with name
finp="./lists/proj.allfixed"
; NOTE(review): readasciifile is a project routine. It is assumed to return
; the number of lines read and the lines in fnmAr -- confirm.
nfiles=readasciifile(finp,fnmAr)
if nfiles lt 2 then message,"need at least 2 file names in " + finp
bnmAr=file_basename(fnmAr)
iis=sort(bnmAr)
bnmArS=bnmAr[iis]
fnmArS=fnmAr[iis]
;
; look for dup basenames.
; cntArS[head of set] = number of entries in the set, 0 for the other members.
; The loop includes the last entry so a duplicate that ends the list is
; counted with its set.
;
cntArS=intarr(nfiles)
i0=0l
for i=0L,nfiles-1 do begin
    if (bnmArS[i] eq bnmArS[i0]) then begin
        cntArS[i0]++
    endif else begin
        cntArS[i]=1
        i0=i
    endelse
endfor
;
; throw out files not part of dups (count of exactly 1)
;
ii=where(cntArS ne 1,cntsne1)
if cntsne1 eq 0 then message,"no duplicate basenames found in " + finp
fnmars=fnmArS[ii]
cntArS=cntArS[ii]
iisgt1=where((cntArS gt 1),cntsgt1)
print,"cntsgt1:",cntsgt1
print,"start file_infos"
fi1s=file_info(fnmars[iisgt1])
print," done file_info1"
fi2s=file_info(fnmars[iisgt1 +1])
print," done file_info2"
;
; make sure 2 sizes are the same, if not exclude.. just a common name
;
ii=where(fi1s.size eq fi2s.size,cnt)
if cnt eq 0 then message,"no common basenames have matching file sizes"
if cnt lt cntsgt1 then begin
    iisgt1=iisgt1[ii]
    cntsgt1=cnt
    fi1s=fi1s[ii]
    fi2s=fi2s[ii]
endif
print,"After size check: cntsgt1:",cntsgt1
;
; takes forever to compare large files: 60 secs for 2gb. so:
;  1. size the same
;  2. 1st 30k the same
;  3. last 30k the same
;  then assume equal
;
;stop
doit:
dupIAr=replicate(dupI,cntsgt1)
icur=0l
nbytes=30000l      ; to check front,rear
snbytes=strtrim(nbytes,2)
for i=0L,cntsgt1-1 do begin
    fsize=fi1s[i].size
    iis1=iisgt1[i]          ; index into fnmars
    stat1=-1
    stat2=-1
    stat3=-1
    stat4=-1
    match=0
    toskip=fsize - nbytes
    doend=toskip gt nbytes  ; don't bother with the tail if small
    ; i0 format: never overflows into asterisks, however large the offset
    sskip=string(toskip,format='(i0)')
    fnm1=fnmars[iis1]
    fnm2=fnmars[iis1+1]
    ;
    ; cmp is run without a shell, arguments passed as an array, so names
    ; holding blanks or shell metacharacters are handed over unchanged.
    ;
    spawn,['cmp','-n',snbytes,fnm1,fnm2],result,err_result,/noshell,$
          exit_status=stat1
    if (stat1 eq 0) then begin
        if doend then begin
            spawn,['cmp','-i',sskip,fnm1,fnm2],result,err_result,/noshell,$
                  exit_status=stat2
        endif else begin
            stat2=0
        endelse
        if stat2 eq 0 then begin
            dupIar[icur].bname=file_basename(fi1s[i].name)
            dupIar[icur].nduporig=cntArS[iis1]
            dupIar[icur].ndupchk=2
            dupIar[icur].ind=iis1
            dupIar[icur].fsize=fi1s[i].size
            dupIar[icur].flist[0]=fi1s[i].name
            dupIar[icur].flist[1]=fi2s[i].name
            match=1
            ; first two are the same.. check a third file here if there is one
            if (cntArS[iis1] gt 2) then begin
                fnm3=fnmars[iis1+2]
                fi3=file_info(fnm3)
                if (fi3.size eq fi1s[i].size) then begin
                    spawn,['cmp','-n',snbytes,fi1s[i].name,fnm3],result,$
                          err_result,/noshell,exit_status=stat3
                    if stat3 eq 0 then begin
                        if doend then begin
                            spawn,['cmp','-i',sskip,fi1s[i].name,fnm3],result,$
                                  err_result,/noshell,exit_status=stat4
                        endif else begin
                            stat4=0
                        endelse
                    endif
                    if stat4 eq 0 then begin
                        dupIar[icur].ndupchk++
                        dupIar[icur].flist[2]=fnm3
                    endif
                endif
            endif
        endif
    endif
    stat1=fix(stat1)
    stat2=fix(stat2)
    stat3=fix(stat3)
    stat4=fix(stat4)
    print,i,icur,stat1,stat2,stat3,stat4,fi1s[i].size
    if match gt 0 then icur++
endfor
; stop
;
; keep only the entries that were filled in (confirmed duplicates)
;
ii=where(dupIar.ndupchk gt 0,cnt)
if cnt eq 0 then message,"no duplicate files confirmed; dupi.sav not written"
if cnt lt cntsgt1 then begin
    dupIar=dupIar[ii]
    cntsgt1=cnt
    ; ii indexes dupIar (packed by icur), not iisgt1; rebuild from .ind
    iisgt1=dupIar.ind
endif
lastRun=systime()
save,lastrun,dupiar,fnmars,file="dupi.sav"
end