codice:
'*******************************
' COSE DA FARE:
' Lettura dai file EXCEL
' Lettura dai file PDF
'*******************************

Option Explicit
On Error Resume Next

Dim ThePageURL, TheTagPattern, TheArrayEndingPattern, StopSpider, URLStatus, DocExt, ObjWord, ArrayStringsToRemove, TitleToRemove
Dim StrPageContent, Item, iStartingTagPattern, Cnt, i, j, TheArrayDocumentsExtension, ObjDocContent, TheTextToRemove, wdDoNotSaveChanges
Dim iEndingTagPattern, TempURL, SiteURL, objRegExpMatch, Element, ValidDocExtToIndex, StrDocumentContent, ArrayFileToRemove, PageRank
Dim ObjConn, ObjRS, TempObjRS, ObjRSPageRank, StrSQL, StrSQLPageRank, StrSQLInsert, ObjXMLHttp, ObjRegExp, ObjExpMatch, bLinkFound, TheContent, TheURL
Dim iYear, iMonth, iDay, iHour, iMinutes, iSeconds, DateTime, e, iStartingPos, iEndingPos, ArrayCode, ArrayStrings, SkipURL
Dim PageTitle, PageKeywords, PageDescription, PageContent, DelimiterFound, TempCode, TempArrayCode, TheText, k, ObjDocProps
Dim ObjMail,ObjConf, Flds, Smtp, Port, Authenticate, MailUser, MailPass, MailRecipient, MailSender

Function GetError()
	If Err <> 0 Then
		Call NotifyErrorByEmail(Err.Description, Err.Line)		
	End If
End Function

Function NotifyErrorByEmail(TheErrorDescription, TheErrorLine)
	Set ObjMail = CreateObject("CDO.Message")
		Set ObjConf = CreateObject("CDO.Configuration")
			Set ObjFlds = ObjConf.Fields
				ObjFlds("http://schemas.microsoft.com/cdo/configuration/sendusing") = 2
				ObjFlds("http://schemas.microsoft.com/cdo/configuration/smtpserver") = Smtp
				ObjFlds("http://schemas.microsoft.com/cdo/configuration/smtpserverport") = Port
				ObjFlds("http://schemas.microsoft.com/cdo/configuration/smtpauthenticate") = Authenticate
					If Authenticate = 1 Then
						Flds("http://schemas.microsoft.com/cdo/configuration/sendusername") = MailUser
						Flds("http://schemas.microsoft.com/cdo/configuration/sendpassword") = MailPass
					End If				
				ObjFlds.Update()
					Set ObjMail.Configuration = ObjConf
					ObjMail.to = MailRecipient
					ObjMail.from = MailSender
					ObjMail.subject = TitleToRemove & "Errore nell'indicizzazione del sito: " & SiteURL
					ObjMail.textBody = "Sito Web: " & TitleToRemove & SiteURL & vbCrLf &_
							 					  	 "Descrizione errore: " & TheErrorDescription & vbCrLf &_
							 					  	 "Riga errore: " & TheErrorLine & vbCrLf &_ 												 
												  	 "Data/ora: " & formatDateTime(now(), vbGeneralDate) & vbCrLf 					
					ObjMail.Send()
			Set ObjFlds = Nothing
		Set ObjConf = Nothing
	Set ObjMail = Nothing
End Function

Function GetLinksInPage(ThePageURL, TheTagPattern, TheArrayEndingPattern, TheArrayDocumentsExtension)
	bLinkFound = False
	ObjXMLHttp.Open "GET", ThePageURL, False
	ObjXMLHttp.Send
	Call GetError()
	URLStatus = CInt(ObjXMLHttp.Status)		
		If URLStatus = 200 Then
			If ObjXMLHttp.ReadyState = 4 Then			
				StrPageContent = ObjXMLHttp.ResponseText			
				With ObjRegExp
					.Pattern = TheTagPattern
					.IgnoreCase = True
					.Global = True
				End With
					Set ObjRegExpMatch = ObjRegExp.Execute(StrPageContent)
						If ObjRegExpMatch.Count > 0 Then						
							For Each Item In ObjRegExpMatch						
								iStartingTagPattern = (Item.FirstIndex + Len(TheTagPattern) + 1)							
								Cnt = 1
									For i = iStartingTagPattern To Len(StrPageContent)
									bLinkFound = False
										For j = 0 To UBound(TheArrayEndingPattern)									
											If LCase(Mid(StrPageContent,i,Len(TheArrayEndingPattern(j)))) = LCase(TheArrayEndingPattern(j)) Then
												iEndingTagPattern = Cnt - 1			
												bLinkFound = True								
												Exit For
											End If
										Next
										If bLinkFound Then Exit For
										Cnt = Cnt + 1									
									Next								
									TempURL = Mid(StrPageContent,iStartingTagPattern,iEndingTagPattern)										
									TempURL = Replace(TempURL,"""","")
										If InStrRev(TempURL,"#") > 0 Then
											TempURL = Left(TempURL,InStrRev(TempURL,"#") -1)
										End If
									TempURL = CheckDomainURL(TempURL)
										If Not SkipURL Then
											For Each Element In TheArrayDocumentsExtension
												If Right(TempURL,Len(Element)) = Element Then
												Call StoreLink(TempURL,URLStatus)	
												End If												
											Next
										End If												
							Next
						End If
					Set ObjRegExpMatch = Nothing
			End If		
		End If
End Function

Function BuildPageRank()
	StrSQL = "SELECT PageURL FROM tblPages"
	ObjRS.Open StrSQL, ObjConn, 3, 3
		Call GetError()
		If Not ObjRS.EOF Then
			Do Until ObjRS.EOF
				PageRank = 0
				StrSQLPageRank = "SELECT PageURL FROM tblPages WHERE PageURL != '" & Replace(ObjRS("PageURL"),"'","''") & "'"
				Set ObjRSPageRank = ObjConn.Execute(StrSQLPageRank)
					Call GetError()
					If Not ObjRSPageRank.EOF Then						
						Do Until ObjRSPageRank.EOF							
							TempURL = ObjRSPageRank("PageURL")
								If Right(TempURL,5) = ".html" Then
									ObjXMLHttp.Open "GET", SiteURL & TempURL, False
									ObjXMLHttp.Send
									Call GetError()
									URLStatus = CInt(ObjXMLHttp.Status)	
										If URLStatus = 200 Then
											If ObjXMLHttp.ReadyState = 4 Then			
												StrPageContent = ObjXMLHttp.ResponseText			
												With ObjRegExp
													.Pattern = ObjRS("PageURL")
													.IgnoreCase = True
													.Global = True
												End With
												Set ObjRegExpMatch = ObjRegExp.Execute(StrPageContent)																								
													PageRank = PageRank + ObjRegExpMatch.Count
												Set ObjRegExpMatch = Nothing												
											End If
										End If
									End If
								ObjRSPageRank.MoveNext
							Loop
							ObjConn.Execute("UPDATE tblPages SET PageRank = " & PageRank & " WHERE PageURL = '" & Replace(ObjRS("PageURL"),"'","''") & "'")												
							Call GetError()
						End If
					ObjRSPageRank.Close
					Set ObjRSPageRank = Nothing
				ObjRS.MoveNext
			Loop
		End If
	ObjRS.Close
End Function

Function LocateCodeInPage(ThePageURL, TheTagProperty, TheTagDelimiter, ByRef TheCode)
DelimiterFound = False
	ObjXMLHttp.Open "GET", ThePageURL, False
			ObjXMLHttp.Send		
				Call GetError()
				If ObjXMLHttp.ReadyState = 4 Then 
					StrPageContent = ObjXMLHttp.ResponseText
						With ObjRegExp
							.Pattern = TheTagProperty
							.IgnoreCase = True
							.Global = True
						End With
							Set ObjExpMatch = ObjRegExp.Execute(StrPageContent)
								If ObjExpMatch.Count > 0 Then
									For Each Item In ObjExpMatch
										iStartingPos = (Item.FirstIndex + Len(TheTagProperty) + 1)
										Cnt = 1
										DelimiterFound = False
											For i = iStartingPos to Len(StrPageContent)
												For j = 0 to UBound(TheTagDelimiter)															
													If LCase(Mid(StrPageContent,i,Len(TheTagDelimiter(j)))) = LCase(TheTagDelimiter(j)) Then													
														iEndingPos = Cnt - 1
														DelimiterFound = True
														Exit For														
													End If 													
												Next
												If DelimiterFound Then Exit For
												Cnt = Cnt + 1
											Next
										TempCode = Mid(StrPageContent,iStartingPos,iEndingPos)										
										TheCode = TempCode											
									Next	
								End If	
					End If
		ArrayCode = TempArrayCode																					
End Function

Function GetWordDocumentContent(TheDocument)
Set ObjWord = CreateObject("Word.Application")
Call GetError()
ObjWord.Documents.Open TheDocument
	Call GetError()
	Set ObjDocContent = ObjWord.ActiveDocument
		Call GetError()
		StrDocumentContent = ObjDocContent.Content.Text
		StrDocumentContent = Replace(StrDocumentContent,vbCrLf," ")
		PageContent = StrDocumentContent
	Set ObjDocContent = Nothing
	Set ObjDocProps = ObjWord.ActiveDocument.BuiltInDocumentProperties		
		PageTitle = ObjDocProps("Title").Value		
		PageKeywords = ObjDocProps("keywords").Value
		PageDescription = ObjDocProps("Comments").Value	
		Call GetError()
	Set ObjDocProps = Nothing
ObjWord.ActiveDocument.Close wdDoNotSaveChanges
ObjWord.Quit
Set ObjWord = Nothing
End Function

Function StoreLink(TheLink, TheStatus)
	Set TempObjRS = ObjConn.Execute("SELECT PageURL FROM tblURL WHERE PageURL = '" & Replace(TheLink,"'","''") & "'")
	Call GetError()
	If TempObjRS.EOF Then
		ObjConn.Execute("INSERT INTO tblURL (PageURL, PageIndexed, PageStatus) VALUES ('" & Replace(TheLink,"'","''") & "', 0, " & TheStatus & ")")	
		Call GetError()
	End If
	TempObjRS.Close
	Set TempObjRS = Nothing
End Function

Function IndexPage(TheArrayValidDocExtToIndex)
	StrSQL = "SELECT DISTINCT(PageURL) FROM tblURL WHERE PageIndexed = 0 AND PageStatus = 200 GROUP BY PageURL"	
	ObjRS.Open StrSQL, ObjConn, 3, 3
		Call GetError()
		If Not ObjRS.EOF Then
			Do Until ObjRS.EOF
				For e = 0 To UBound(TheArrayValidDocExtToIndex)
					If Right(ObjRS("PageURL"),Len(TheArrayValidDocExtToIndex(e))) = TheArrayValidDocExtToIndex(e) Then
						Call getLinksInPage(SiteURL & ObjRS("PageURL"), "href=", Array(" ",">"), DocExt)
					End If
				Next
				ObjConn.Execute("UPDATE tblURL SET PageIndexed = 1 WHERE PageURL = '" & Replace(ObjRS("PageURL"),"'","''") & "'")
				Call GetError()
			ObjRS.MoveNext
			Loop
		Else
			StopSpider = True
			ObjRS.Close
			Exit Function
		End If
	ObjRS.Close	
End Function