Hi Powercoder
Ich habe hier ein kleines Crawler-Skript, das aus einer Textdatei mit URLs jede einzelne URL besucht und prüft, ob es auf der jeweiligen Seite neue Links gibt. Dazu muss das Skript ein Textfile auf doppelte Links prüfen – das File ist im Moment 20 MB gross und wird jeden Tag grösser. Je grösser das Textfile, desto langsamer wird das Ganze. Hat jemand eine Idee, wie man das schneller machen kann?
Spoiler anzeigen
#Region ;**** Directives created by AutoIt3Wrapper_GUI ****
#AutoIt3Wrapper_UseX64=n
#AutoIt3Wrapper_Res_Fileversion=0.0.0.17
#AutoIt3Wrapper_Res_Fileversion_AutoIncrement=y
#AutoIt3Wrapper_Add_Constants=n
#AutoIt3Wrapper_AU3Check_Stop_OnWarning=y
#EndRegion ;**** Directives created by AutoIt3Wrapper_GUI ****
#Include <File.au3>
#include <IE.au3>
#include <Array.au3>
#include <string.au3>
; ---------------------------------------------------------------------------
; Script setup: AutoIt options, file names, run timer and input-file check.
; ---------------------------------------------------------------------------
;Options
Opt("WinTitleMatchMode", 2) ;1=start, 2=subStr, 3=exact, 4=advanced, -1 to -4=Nocase
Opt("WinSearchChildren", 1) ;0=no, 1=search children also
Opt("WinTextMatchMode", 1) ;1=complete, 2=quick
Opt("WinWaitDelay", 250)
Opt("WinDetectHiddenText", 1) ;1=on
Opt("MouseCoordMode", 0)

$testurl = "www.google.de" ; host that is pinged to test the internet connection
$sites = FileOpen("rlinksites.txt", 0) ; read-mode handle for the list of sites to visit
$savepath = "rlinkurls.txt" ; de-duplicated links of this run
; Scratch file that the crawl loop appends to and the de-duplication step
; reads back. The on-disk name really is "True"; do not rename it here
; without checking every place that writes the scratch file.
$temppath = "True"
$log = "old_links_log.txt" ; links collected by earlier runs

; Start every run without leftovers from the previous one.
FileDelete($savepath)
FileDelete($temppath)

Local $hTimer = TimerInit() ; Begin the timer and store the handle in a variable.

Sleep(5000)

; Check if file opened for reading OK
If $sites = -1 Then
	;MsgBox(0, "Error", "Unable to open file.")
	; Report the reason on stderr and use a non-zero exit code so that a
	; missing site list is distinguishable from a successful run.
	ConsoleWriteError("Unable to open rlinksites.txt" & @CRLF)
	Exit 1
EndIf
; ---------------------------------------------------------------------------
; Main crawl loop.
;
; Reads one URL per line from $sites, opens it in Internet Explorer, and
; appends every link that is not yet listed in $log to the scratch file
; $temppath. The browser (and a few helper processes) are closed again
; after each site.
; ---------------------------------------------------------------------------

; Load the log of already-known links ONCE. The log is not modified while
; this loop runs, so re-reading it for every single link only costs time.
; Line endings are normalised to @LF and the text is wrapped in @LF so that
; a link can be looked up as a complete line via "@LF & link & @LF".
$knownLinks = @LF & StringStripCR(FileRead($log)) & @LF

; One registration of the COM error handler is sufficient for the whole run.
_IEErrorHandlerRegister("MyErrFunc")

; Read in lines of text until the EOF is reached
While 1
	; -------------------------------------------------------------------------
	; Test if Internet available (gives up and exits after 10 failed retries)
	MsgBox(4096, "Testing...", "...Internet Connection", 2)
	$e = 0
	While Ping($testurl) = 0
		If $e >= 10 Then Exit
		$e = $e + 1
		MsgBox(4096, "No...", "...Connection - Try again", 2)
		MsgBox(0, $e, "", 1)
		Sleep(1000)
	WEnd
	MsgBox(0, "Internet is...", "...Working", 2)
	; -------------------------------------------------------------------------

	$line = FileReadLine($sites)
	If @error Then ExitLoop ; EOF or read error
	If $line = "" Then ContinueLoop ;skip blank lines

	; Keep the browser object in its own variable and test the result of
	; _IECreate directly; @error is reset by every subsequent function call.
	$oIE = _IECreate($line)
	If @error Or Not IsObj($oIE) Then ContinueLoop

	Sleep(500)
	; Bring the browser window to the front. The waits are bounded so that an
	; unexpected window title cannot block the crawl forever.
	If WinWait("Windows Internet Explorer", "", 30) Then
		WinSetState("Windows Internet Explorer", "", @SW_MAXIMIZE)
		If Not WinActive("Windows Internet Explorer", "") Then WinActivate("Windows Internet Explorer", "")
		WinWaitActive("Windows Internet Explorer", "", 30)
	EndIf
	Sleep(5000)

	$oLinks = _IELinkGetCollection($oIE)
	If IsObj($oLinks) Then
		; Append new links straight to the scratch file. Writing line by line
		; avoids growing an array with _ArrayAdd for every link.
		$writeurls = FileOpen($temppath, 1)
		If $writeurls <> -1 Then
			For $oLink In $oLinks
				$href = $oLink.href
				If $href = "" Then ContinueLoop
				; Literal, case-sensitive whole-line lookup. The href must not be
				; used as a regular expression: characters such as "?", "." or "+"
				; are regex metacharacters and make the match unreliable.
				If Not StringInStr($knownLinks, @LF & $href & @LF, 1) Then FileWriteLine($writeurls, $href)
			Next
			FileClose($writeurls)
		EndIf
	EndIf

	_IEQuit($oIE)

	; NEW
	; Dismiss a possibly open dialog and force-close leftover processes.
	Send("{ENTER}")
	ProcessClose("iexplore.exe")
	Sleep(1000)
	ProcessClose("iexplore.exe")
	Sleep(1000)
	ProcessClose("iexplore.exe")
	Sleep(1000)
	ProcessClose("wmplayer.exe")
	Sleep(1000)
	ProcessClose("msinm.exe")
WEnd

; The site list is no longer needed once the loop has finished.
FileClose($sites)
; ---------------------------------------------------------------------------
; Post-processing: de-duplicate the links collected in this run, append them
; to the log of known links, clean up processes and report the run time.
; ---------------------------------------------------------------------------

; REMOVE DUPLICATE LINKS
Dim $oFile, $nFile
; The scratch file only exists if at least one new link was found.
If FileExists($temppath) And _FileReadToArray($temppath, $oFile) Then
	; _FileReadToArray stores the line count in element 0; remove it so the
	; number is not treated as a link.
	_ArrayDelete($oFile, 0)
	$nFile = _ArrayUnique($oFile)
	; _ArrayUnique also puts a count into element 0, so write from index 1.
	If IsArray($nFile) Then _FileWriteFromArray($savepath, $nFile, 1)
EndIf

; Write "old" URLs to the log so the same links are not reported again
Local $fileAll = FileOpen($log, 1) ;Write mode = Append to end of file.
If $fileAll <> -1 Then
	FileWrite($fileAll, FileRead($savepath))
	FileClose($fileAll)
EndIf

; NEW
; Dismiss a possibly open dialog and force-close leftover processes.
Send("{ENTER}")
ProcessClose("iexplore.exe")
Sleep(1000)
ProcessClose("iexplore.exe")
Sleep(1000)
ProcessClose("iexplore.exe")
Sleep(1000)
ProcessClose("wmplayer.exe")
Sleep(1000)
ProcessClose("msinm.exe")

; Elapsed time in milliseconds since TimerInit() was called at script start.
Local $iDiff = TimerDiff($hTimer)
MsgBox(4096, "Execution Time", $iDiff)
; MyErrFunc: COM error callback that the script registers through
; _IEErrorHandlerRegister("MyErrFunc").
; It copies the details of the current COM error from $oIEErrorHandler into
; a readable report. The MsgBox that would show the report is commented out,
; so the error is currently swallowed silently and the script carries on.
; Takes no parameters and returns nothing.
Func MyErrFunc()
	; Important: the error object variable MUST be named $oIEErrorHandler
	Local $ErrorScriptline = $oIEErrorHandler.scriptline
	Local $ErrorNumber = $oIEErrorHandler.number
	; Error number as an 8-digit hex string (the length argument was missing).
	Local $ErrorNumberHex = Hex($oIEErrorHandler.number, 8)
	Local $ErrorDescription = StringStripWS($oIEErrorHandler.description, 2)
	Local $ErrorWinDescription = StringStripWS($oIEErrorHandler.WinDescription, 2)
	Local $ErrorSource = $oIEErrorHandler.Source
	Local $ErrorHelpFile = $oIEErrorHandler.HelpFile
	Local $ErrorHelpContext = $oIEErrorHandler.HelpContext
	Local $ErrorLastDllError = $oIEErrorHandler.LastDllError
	Local $ErrorOutput = ""
	$ErrorOutput &= "--> COM Error Encountered in " & @ScriptName & @CR
	$ErrorOutput &= "----> $ErrorScriptline = " & $ErrorScriptline & @CR
	$ErrorOutput &= "----> $ErrorNumberHex = " & $ErrorNumberHex & @CR
	$ErrorOutput &= "----> $ErrorNumber = " & $ErrorNumber & @CR
	$ErrorOutput &= "----> $ErrorWinDescription = " & $ErrorWinDescription & @CR
	$ErrorOutput &= "----> $ErrorDescription = " & $ErrorDescription & @CR
	$ErrorOutput &= "----> $ErrorSource = " & $ErrorSource & @CR
	$ErrorOutput &= "----> $ErrorHelpFile = " & $ErrorHelpFile & @CR
	$ErrorOutput &= "----> $ErrorHelpContext = " & $ErrorHelpContext & @CR
	$ErrorOutput &= "----> $ErrorLastDllError = " & $ErrorLastDllError
	;MsgBox(0,"COM Error", $ErrorOutput)
	Return
EndFunc   ;==>MyErrFunc