Visualizzazione dei risultati da 1 a 9 su 9
  1. #1

    uno spider/crawler in asp

    ciao, sto cercando uno script in asp che funga da crawler/spider, cioè che vada in giro per la rete a indicizzare siti internet.
    Questo è possibile in asp? oppure ne esistono solo in php?

  2. #2

    Re: uno spider/crawler in asp

    Originariamente inviato da azocomposto
    ciao, sto cercando uno script in asp che funga da crawler/spider, cioè che vada in giro per la rete a indicizzare siti internet.
    Questo è possibile in asp? oppure ne esistono solo in php?
    Io ne ho fatto uno in VBScript su file con estenzione .vbs. Indicizza un intero sito partendo dalla URL di base catalogando pagine HTML e file DOC. Volevo aggiungere i file Excel e PDF, ma mi sono fermato.
    Devo riprenderlo sottomano. Usa l'oggetto XMLHTTP e RegExp e un db con due tabella dove salvare i dati.

  3. #3
    complimenti, mi sembra ottimo, è proprio ciò che cerco
    mi potresti mandare una copia zippata?
    grazie mems

  4. #4
    Utente di HTML.it L'avatar di fazius
    Registrato dal
    Mar 2006
    residenza
    Torino
    Messaggi
    870
    se è possibile mi accodo alla richiesta ciao
    lunga vita e prosperità

  5. #5
    Originariamente inviato da azocomposto
    complimenti, mi sembra ottimo, è proprio ciò che cerco
    mi potresti mandare una copia zippata?
    grazie mems
    Serve un db SQL Server e due tabelle:
    tblURL (pageURL varchar(255), pageIndexed int, pageStatus int)
    tblPages (pageURL varchar(255), pageTitle varchar(255), pageKeyowrds varchar(1000), pageDescription varchar(1000), pageContent text, pageRank int, dateTime varchar(14))
    Questo è il codice del file spider.vbs
    Lo piazzi sul desktop e ci fai doppio click.
    Ovviamente non ti assicuro nulla. Era in fase di sviluppo e mi sono fermato tempo fa.
    In rosso la parte da personalizzare.

  6. #6
    codice:
    '*******************************
    ' COSE DA FARE:
    ' Lettura dai file EXCEL
    ' Lettura dai file PDF
    '*******************************
    
    Option Explicit
    On Error Resume Next
    
    Dim ThePageURL, TheTagPattern, TheArrayEndingPattern, StopSpider, URLStatus, DocExt, ObjWord, ArrayStringsToRemove, TitleToRemove
    Dim StrPageContent, Item, iStartingTagPattern, Cnt, i, j, TheArrayDocumentsExtension, ObjDocContent, TheTextToRemove, wdDoNotSaveChanges
    Dim iEndingTagPattern, TempURL, SiteURL, objRegExpMatch, Element, ValidDocExtToIndex, StrDocumentContent, ArrayFileToRemove, PageRank
    Dim ObjConn, ObjRS, TempObjRS, ObjRSPageRank, StrSQL, StrSQLPageRank, StrSQLInsert, ObjXMLHttp, ObjRegExp, ObjExpMatch, bLinkFound, TheContent, TheURL
    Dim iYear, iMonth, iDay, iHour, iMinutes, iSeconds, DateTime, e, iStartingPos, iEndingPos, ArrayCode, ArrayStrings, SkipURL
    Dim PageTitle, PageKeywords, PageDescription, PageContent, DelimiterFound, TempCode, TempArrayCode, TheText, k, ObjDocProps
    Dim ObjMail,ObjConf, Flds, Smtp, Port, Authenticate, MailUser, MailPass, MailRecipient, MailSender
    
    Function GetError()
    	If Err <> 0 Then
    		Call NotifyErrorByEmail(Err.Description, Err.Line)		
    	End If
    End Function
    
    Function NotifyErrorByEmail(TheErrorDescription, TheErrorLine)
    	Set ObjMail = CreateObject("CDO.Message")
    		Set ObjConf = CreateObject("CDO.Configuration")
    			Set ObjFlds = ObjConf.Fields
    				ObjFlds("http://schemas.microsoft.com/cdo/configuration/sendusing") = 2
    				ObjFlds("http://schemas.microsoft.com/cdo/configuration/smtpserver") = Smtp
    				ObjFlds("http://schemas.microsoft.com/cdo/configuration/smtpserverport") = Port
    				ObjFlds("http://schemas.microsoft.com/cdo/configuration/smtpauthenticate") = Authenticate
    					If Authenticate = 1 Then
    						Flds("http://schemas.microsoft.com/cdo/configuration/sendusername") = MailUser
    						Flds("http://schemas.microsoft.com/cdo/configuration/sendpassword") = MailPass
    					End If				
    				ObjFlds.Update()
    					Set ObjMail.Configuration = ObjConf
    					ObjMail.to = MailRecipient
    					ObjMail.from = MailSender
    					ObjMail.subject = TitleToRemove & "Errore nell'indicizzazione del sito: " & SiteURL
    					ObjMail.textBody = "Sito Web: " & TitleToRemove & SiteURL & vbCrLf &_
    							 					  	 "Descrizione errore: " & TheErrorDescription & vbCrLf &_
    							 					  	 "Riga errore: " & TheErrorLine & vbCrLf &_ 												 
    												  	 "Data/ora: " & formatDateTime(now(), vbGeneralDate) & vbCrLf 					
    					ObjMail.Send()
    			Set ObjFlds = Nothing
    		Set ObjConf = Nothing
    	Set ObjMail = Nothing
    End Function
    
    Function GetLinksInPage(ThePageURL, TheTagPattern, TheArrayEndingPattern, TheArrayDocumentsExtension)
    	bLinkFound = False
    	ObjXMLHttp.Open "GET", ThePageURL, False
    	ObjXMLHttp.Send
    	Call GetError()
    	URLStatus = CInt(ObjXMLHttp.Status)		
    		If URLStatus = 200 Then
    			If ObjXMLHttp.ReadyState = 4 Then			
    				StrPageContent = ObjXMLHttp.ResponseText			
    				With ObjRegExp
    					.Pattern = TheTagPattern
    					.IgnoreCase = True
    					.Global = True
    				End With
    					Set ObjRegExpMatch = ObjRegExp.Execute(StrPageContent)
    						If ObjRegExpMatch.Count > 0 Then						
    							For Each Item In ObjRegExpMatch						
    								iStartingTagPattern = (Item.FirstIndex + Len(TheTagPattern) + 1)							
    								Cnt = 1
    									For i = iStartingTagPattern To Len(StrPageContent)
    									bLinkFound = False
    										For j = 0 To UBound(TheArrayEndingPattern)									
    											If LCase(Mid(StrPageContent,i,Len(TheArrayEndingPattern(j)))) = LCase(TheArrayEndingPattern(j)) Then
    												iEndingTagPattern = Cnt - 1			
    												bLinkFound = True								
    												Exit For
    											End If
    										Next
    										If bLinkFound Then Exit For
    										Cnt = Cnt + 1									
    									Next								
    									TempURL = Mid(StrPageContent,iStartingTagPattern,iEndingTagPattern)										
    									TempURL = Replace(TempURL,"""","")
    										If InStrRev(TempURL,"#") > 0 Then
    											TempURL = Left(TempURL,InStrRev(TempURL,"#") -1)
    										End If
    									TempURL = CheckDomainURL(TempURL)
    										If Not SkipURL Then
    											For Each Element In TheArrayDocumentsExtension
    												If Right(TempURL,Len(Element)) = Element Then
    												Call StoreLink(TempURL,URLStatus)	
    												End If												
    											Next
    										End If												
    							Next
    						End If
    					Set ObjRegExpMatch = Nothing
    			End If		
    		End If
    End Function
    
    Function BuildPageRank()
    	StrSQL = "SELECT PageURL FROM tblPages"
    	ObjRS.Open StrSQL, ObjConn, 3, 3
    		Call GetError()
    		If Not ObjRS.EOF Then
    			Do Until ObjRS.EOF
    				PageRank = 0
    				StrSQLPageRank = "SELECT PageURL FROM tblPages WHERE PageURL != '" & Replace(ObjRS("PageURL"),"'","''") & "'"
    				Set ObjRSPageRank = ObjConn.Execute(StrSQLPageRank)
    					Call GetError()
    					If Not ObjRSPageRank.EOF Then						
    						Do Until ObjRSPageRank.EOF							
    							TempURL = ObjRSPageRank("PageURL")
    								If Right(TempURL,5) = ".html" Then
    									ObjXMLHttp.Open "GET", SiteURL & TempURL, False
    									ObjXMLHttp.Send
    									Call GetError()
    									URLStatus = CInt(ObjXMLHttp.Status)	
    										If URLStatus = 200 Then
    											If ObjXMLHttp.ReadyState = 4 Then			
    												StrPageContent = ObjXMLHttp.ResponseText			
    												With ObjRegExp
    													.Pattern = ObjRS("PageURL")
    													.IgnoreCase = True
    													.Global = True
    												End With
    												Set ObjRegExpMatch = ObjRegExp.Execute(StrPageContent)																								
    													PageRank = PageRank + ObjRegExpMatch.Count
    												Set ObjRegExpMatch = Nothing												
    											End If
    										End If
    									End If
    								ObjRSPageRank.MoveNext
    							Loop
    							ObjConn.Execute("UPDATE tblPages SET PageRank = " & PageRank & " WHERE PageURL = '" & Replace(ObjRS("PageURL"),"'","''") & "'")												
    							Call GetError()
    						End If
    					ObjRSPageRank.Close
    					Set ObjRSPageRank = Nothing
    				ObjRS.MoveNext
    			Loop
    		End If
    	ObjRS.Close
    End Function
    
    Function LocateCodeInPage(ThePageURL, TheTagProperty, TheTagDelimiter, ByRef TheCode)
    DelimiterFound = False
    	ObjXMLHttp.Open "GET", ThePageURL, False
    			ObjXMLHttp.Send		
    				Call GetError()
    				If ObjXMLHttp.ReadyState = 4 Then 
    					StrPageContent = ObjXMLHttp.ResponseText
    						With ObjRegExp
    							.Pattern = TheTagProperty
    							.IgnoreCase = True
    							.Global = True
    						End With
    							Set ObjExpMatch = ObjRegExp.Execute(StrPageContent)
    								If ObjExpMatch.Count > 0 Then
    									For Each Item In ObjExpMatch
    										iStartingPos = (Item.FirstIndex + Len(TheTagProperty) + 1)
    										Cnt = 1
    										DelimiterFound = False
    											For i = iStartingPos to Len(StrPageContent)
    												For j = 0 to UBound(TheTagDelimiter)															
    													If LCase(Mid(StrPageContent,i,Len(TheTagDelimiter(j)))) = LCase(TheTagDelimiter(j)) Then													
    														iEndingPos = Cnt - 1
    														DelimiterFound = True
    														Exit For														
    													End If 													
    												Next
    												If DelimiterFound Then Exit For
    												Cnt = Cnt + 1
    											Next
    										TempCode = Mid(StrPageContent,iStartingPos,iEndingPos)										
    										TheCode = TempCode											
    									Next	
    								End If	
    					End If
    		ArrayCode = TempArrayCode																					
    End Function
    
    Function GetWordDocumentContent(TheDocument)
    Set ObjWord = CreateObject("Word.Application")
    Call GetError()
    ObjWord.Documents.Open TheDocument
    	Call GetError()
    	Set ObjDocContent = ObjWord.ActiveDocument
    		Call GetError()
    		StrDocumentContent = ObjDocContent.Content.Text
    		StrDocumentContent = Replace(StrDocumentContent,vbCrLf," ")
    		PageContent = StrDocumentContent
    	Set ObjDocContent = Nothing
    	Set ObjDocProps = ObjWord.ActiveDocument.BuiltInDocumentProperties		
    		PageTitle = ObjDocProps("Title").Value		
    		PageKeywords = ObjDocProps("keywords").Value
    		PageDescription = ObjDocProps("Comments").Value	
    		Call GetError()
    	Set ObjDocProps = Nothing
    ObjWord.ActiveDocument.Close wdDoNotSaveChanges
    ObjWord.Quit
    Set ObjWord = Nothing
    End Function
    
    Function StoreLink(TheLink, TheStatus)
    	Set TempObjRS = ObjConn.Execute("SELECT PageURL FROM tblURL WHERE PageURL = '" & Replace(TheLink,"'","''") & "'")
    	Call GetError()
    	If TempObjRS.EOF Then
    		ObjConn.Execute("INSERT INTO tblURL (PageURL, PageIndexed, PageStatus) VALUES ('" & Replace(TheLink,"'","''") & "', 0, " & TheStatus & ")")	
    		Call GetError()
    	End If
    	TempObjRS.Close
    	Set TempObjRS = Nothing
    End Function
    
    Function IndexPage(TheArrayValidDocExtToIndex)
    	StrSQL = "SELECT DISTINCT(PageURL) FROM tblURL WHERE PageIndexed = 0 AND PageStatus = 200 GROUP BY PageURL"	
    	ObjRS.Open StrSQL, ObjConn, 3, 3
    		Call GetError()
    		If Not ObjRS.EOF Then
    			Do Until ObjRS.EOF
    				For e = 0 To UBound(TheArrayValidDocExtToIndex)
    					If Right(ObjRS("PageURL"),Len(TheArrayValidDocExtToIndex(e))) = TheArrayValidDocExtToIndex(e) Then
    						Call getLinksInPage(SiteURL & ObjRS("PageURL"), "href=", Array(" ",">"), DocExt)
    					End If
    				Next
    				ObjConn.Execute("UPDATE tblURL SET PageIndexed = 1 WHERE PageURL = '" & Replace(ObjRS("PageURL"),"'","''") & "'")
    				Call GetError()
    			ObjRS.MoveNext
    			Loop
    		Else
    			StopSpider = True
    			ObjRS.Close
    			Exit Function
    		End If
    	ObjRS.Close	
    End Function

  7. #7
    codice:
    Function StorePageData()
    	StrSQL = "SELECT A.PageURL AS P1, " &_
    					 "B.PageURL AS P2 " &_
    					 "FROM tblURL A " &_
    					 "LEFT JOIN tblPages B " &_
    					 "ON B.PageURL = A.PageURL"
    	ObjRS.Open StrSQL, ObjConn, 1, 3
    		Call GetError()
    		If Not ObjRS.EOF Then
    			Do Until ObjRs.EOF
    				If Right(ObjRS("P1"),5) = ".html" Then
    					Call LocateCodeInPage(SiteURL & ObjRS("P1"), "<title>", Array("</title>"), PageTitle)
    					PageTitle = Replace(PageTitle,"""","")
    						If Left(PageTitle,Len(TitleToRemove)) = TitleToRemove Then
    							PageTitle = Mid(PageTitle,(Len(TitleToRemove) + 1))
    						End If
    					Call LocateCodeInPage(SiteURL & ObjRS("P1"), "name=""keywords"" content=", Array("/>",">"), PageKeywords)
    					PageKeywords = Replace(PageKeywords,"""","")
    					Call LocateCodeInPage(SiteURL & ObjRS("P1"), "name=""description"" content=", Array("/>",">"), PageDescription)
    					PageDescription = Replace(PageDescription,"""","")
    					Call LocateCodeInPage(SiteURL & ObjRS("P1"), "</head>", Array("</body>"), PageContent)	
    					PageContent = StringsToRemove(PageContent)									
    					PageContent = Replace(PageContent,"  ","")
    					PageContent = Replace(PageContent,vbCrLf," ")
    					PageContent = Trim(PageContent)
    					PageContent = RemoveHTML(PageContent)					
    				ElseIf Right(ObjRS("P1"),4) = ".doc" Then					
    					Call GetWordDocumentContent(SiteURL & ObjRS("P1"))	
    						If Left(PageTitle,17) = "Microsoft Word - " Then
    							PageTitle = Replace(PageTitle,"Microsoft Word - ","")
    						End If
    						If Right(PageTitle,4) = ".doc" Then
    							PageTitle = Left(PageTitle,(Len(PageTitle) - 4))
    						End If				
    				Else
    					PageTitle = ""
    					PageKeywords = ""
    					PageDescription = ""
    					PageContent = ""
    				End If				
    				If IsNull(ObjRS("P2")) Or Len(ObjRS("P2")) = 0 Then
    					StrSQLInsert = "INSERT INTO tblPages (" &_
    									 			 "PageURL, " &_
    									 			 "PageTitle, " &_
    									 			 "PageKeywords, " &_
    									 			 "PageDescription, " &_
    									 			 "PageContent, " &_
    									 			 "PageRank, " &_
    												 "DateTime) " &_
    												 "VALUES (" &_
    												 "'" & Replace(ObjRS("P1"),"'","''") & "', " &_
    												 "'" & Replace(PageTitle,"'","''") & "', " &_
    												 "'" & Replace(PageKeywords,"'","''") & "', " &_
    												 "'" & Replace(PageDescription,"'","''") & "', " &_
    												 "'" & Replace(PageContent,"'","''") & "', " &_
    												 "0, " &_
    												 "'" & DateTime & "')"
    				Else
    					StrSQLInsert = "UPDATE tblPages SET " &_
    												 "PageTitle = '" & Replace(PageTitle,"'","''") & "', " &_
    												 "PageKeywords = '" & Replace(PageKeywords,"'","''") & "', " &_
    												 "PageDescription = '" & Replace(PageDescription,"'","''") & "', " &_
    												 "PageContent = '" & Replace(PageContent,"'","''") & "', " &_
    												 "PageRank = 0, " &_
    												 "DateTime = '" & DateTime & "' " &_
    												 "WHERE PageURL = '" & Replace(ObjRS("P1"),"'","''") & "'"
    				End If
    				ObjConn.Execute(StrSQLInsert)
    				Call GetError()
    			ObjRS.MoveNext
    			Loop		
    		End If		
    	ObjRS.Close	
    	Call DeleteOldReferences()	
    	Call BuildPageRank()	
    End Function
    
    Function DeleteOldReferences()
    	ObjConn.Execute("DELETE FROM tblPages WHERE DateTime <> '" & DateTime & "'")
    	ObjConn.Execute("DELETE FROM tblPages WHERE LEFT(PageURL,1) != '/'")
    	ObjConn.Execute("DELETE FROM tblURL WHERE LEFT(PageURL,1) != '/'")
    	Call GetError()
    End Function
    
    Function RemoveHTML(TheContent)
    	With ObjRegExp
    		.Pattern = "<[^>]*>"
    		.IgnoreCase = True
    		.Global = True	
    	End With
    	TheContent = Replace(TheContent,"
    "," ")	
    	TheContent = Replace(TheContent,"</p>"," ")
    	RemoveHTML = ObjRegExp.Replace(TheContent, "")
    End Function
    
    Function GetTextFromPage(ThePageURL)
    	ObjXMLHttp.Open "GET", SiteURL & ThePageURL, False
    	ObjXMLHttp.Send
    	Call GetError()
    	URLStatus = CInt(ObjXMLHttp.Status)	
    		If URLStatus = 200 Then
    			If ObjXMLHttp.ReadyState = 4 Then			
    				GetTextFromPage = ObjXMLHttp.ResponseText
    			End If
    		Else
    			Exit Function	
    		End If
    End Function
    
    Function CheckDomainURL(TheURL)
    SkipURL = False
    	If Left(LCase(TheURL),7) = "http://" Then
    		If Left(LCase(TheURL),Len(SiteURL)) = LCase(SiteURL) Then
    			TheURL = Mid(TheURL,(Len(SiteURL) + 1))
    		ElseIf Not Left(TheURL,1) = "/" Then
    			SkipUrl = True
    		Else
    			SkipURL = True
    		End If		
    	End If
    	CheckDomainURL = TheURL
    End Function
    
    Function StringsToRemove(TheText)	
    		For k = 0 To UBound(ArrayFileToRemove)
    			TheTextToRemove = GetTextFromPage(ArrayFileToRemove(k))
    			TheText = Replace(TheText,TheTextToRemove,"")
    		Next
    		For k = 0 To UBound(ArrayStringsToRemove)			
    			TheText = Replace(TheText,ArrayStringsToRemove(k),"")
    		Next
    		StringsToRemove = TheText
    End Function
    
    Function InitializeCatalog()
    	ObjConn.Execute("DELETE FROM tblURL")
    	Call GetError()
    End Function
    
    Function LoopFunctions()
    	While Not StopSpider
    		Call IndexPage(ValidDocExtToIndex)
    	Wend
    	If StopSpider Then
    		Call StorePageData()
    	End If
    End Function
    
    iYear = DatePart("yyyy",Date())
    iMonth = DatePart("m",Date())
    If iMonth < 10 Then iMonth = "0" & iMonth
    iDay = DatePart("d",Date())
    If iDay < 10 Then iDay = "0" & iDay
    iHour = DatePart("h",Time())
    If iHour < 10 Then iHour = "0" & iHour
    iMinutes = DatePart("n",Time())
    If iMinutes < 10 Then iMinutes = "0" & iMinutes
    iSeconds = DatePart("s",Time())
    If iSeconds < 10 Then iSeconds = "0" & iSeconds
    DateTime = iYear & iMonth & iDay & iHour & iMinutes & iSeconds
    
    Set ObjConn = CreateObject("ADODB.Connection")
    	  ObjConn.Open "Driver={SQL Server};Server=localhost;Database=Search;Uid=utente;Pwd=password;"
    	  Call GetError()
    Set ObjRS = CreateObject("ADODB.Recordset")
    Set ObjXMLHttp = CreateObject("Msxml2.XMLHTTP.4.0")
    		Call GetError()
    Set ObjRegExp = new RegExp
    	
    	SiteURL = "http://www.nomedelsito.it"
    	TitleToRemove = "Blocco statico titolo pagina - "
    	DocExt = Array(".html",".doc")
    	ValidDocExtToIndex = Array(".html")
    	ArrayStringsToRemove = Array("Frase comune in tutte le pagine 1","Frase comune in tutte le pagine 2")
    	ReDim ArrayFileToRemove(2)
    	ArrayFileToRemove(0) = "/include/menu.html"
    	ArrayFileToRemove(1) = "/include/testata.html"
    	ArrayFileToRemove(2) = "/include/footer.html"
    	Smtp = "smtp.nomesito.it"
    	Port = "25"
    	Authenticate = 0 ' 1 = con autenticazione; 0 = senza autenticazione
    	MailUser = ""
    	MailPass = ""
    	MailSender = "mittente@nomesito.it"
    	MailRecipient = "destinatario@nomesito.it"
    	
    		Call InitializeCatalog()
    		Call getLinksInPage(SiteURL, "href=", Array(" ",">"), DocExt)
    		Call LoopFunctions()						
    
    Set ObjRegExp = Nothing
    Set ObjXMLHttp = Nothing
    Set ObjRS = Nothing
    	  ObjConn.Close
    Set ObjConn = Nothing
    
    MsgBox "End Of Script"

  8. #8
    mems, mi hai scritto bue blocchi di codice, potresti gentilmente illustrarmeli e spiegarmi come farmi il database sql? questo tipo di db non l'ho mai usato...

    ciao
    grazie

  9. #9
    I due blocchi di codice fanno parte in un unico file.
    Non è una pagina ASP, ma un file .vbs quindi niente tag <% e %>
    Salvi il tutto in un file testuale e lo rinomini in .vbs e ci clicchi sopra.
    Il db SQL Server lo crei con Enterprise Manager di SQL Server.
    Per il resto, se conosci ASP, non dovresti avere difficoltà.

Permessi di invio

  • Non puoi inserire discussioni
  • Non puoi inserire repliche
  • Non puoi inserire allegati
  • Non puoi modificare i tuoi messaggi
  •  
Powered by vBulletin® Version 4.2.1
Copyright © 2025 vBulletin Solutions, Inc. All rights reserved.