codice:'******************************* ' COSE DA FARE: ' Lettura dai file EXCEL ' Lettura dai file PDF '******************************* Option Explicit On Error Resume Next Dim ThePageURL, TheTagPattern, TheArrayEndingPattern, StopSpider, URLStatus, DocExt, ObjWord, ArrayStringsToRemove, TitleToRemove Dim StrPageContent, Item, iStartingTagPattern, Cnt, i, j, TheArrayDocumentsExtension, ObjDocContent, TheTextToRemove, wdDoNotSaveChanges Dim iEndingTagPattern, TempURL, SiteURL, objRegExpMatch, Element, ValidDocExtToIndex, StrDocumentContent, ArrayFileToRemove, PageRank Dim ObjConn, ObjRS, TempObjRS, ObjRSPageRank, StrSQL, StrSQLPageRank, StrSQLInsert, ObjXMLHttp, ObjRegExp, ObjExpMatch, bLinkFound, TheContent, TheURL Dim iYear, iMonth, iDay, iHour, iMinutes, iSeconds, DateTime, e, iStartingPos, iEndingPos, ArrayCode, ArrayStrings, SkipURL Dim PageTitle, PageKeywords, PageDescription, PageContent, DelimiterFound, TempCode, TempArrayCode, TheText, k, ObjDocProps Dim ObjMail,ObjConf, Flds, Smtp, Port, Authenticate, MailUser, MailPass, MailRecipient, MailSender Function GetError() If Err <> 0 Then Call NotifyErrorByEmail(Err.Description, Err.Line) End If End Function Function NotifyErrorByEmail(TheErrorDescription, TheErrorLine) Set ObjMail = CreateObject("CDO.Message") Set ObjConf = CreateObject("CDO.Configuration") Set ObjFlds = ObjConf.Fields ObjFlds("http://schemas.microsoft.com/cdo/configuration/sendusing") = 2 ObjFlds("http://schemas.microsoft.com/cdo/configuration/smtpserver") = Smtp ObjFlds("http://schemas.microsoft.com/cdo/configuration/smtpserverport") = Port ObjFlds("http://schemas.microsoft.com/cdo/configuration/smtpauthenticate") = Authenticate If Authenticate = 1 Then Flds("http://schemas.microsoft.com/cdo/configuration/sendusername") = MailUser Flds("http://schemas.microsoft.com/cdo/configuration/sendpassword") = MailPass End If ObjFlds.Update() Set ObjMail.Configuration = ObjConf ObjMail.to = MailRecipient ObjMail.from = MailSender ObjMail.subject = TitleToRemove & "Errore nell'indicizzazione del sito: " & SiteURL ObjMail.textBody = "Sito Web: " & TitleToRemove & SiteURL & vbCrLf &_ "Descrizione errore: " & TheErrorDescription & vbCrLf &_ "Riga errore: " & TheErrorLine & vbCrLf &_ "Data/ora: " & formatDateTime(now(), vbGeneralDate) & vbCrLf ObjMail.Send() Set ObjFlds = Nothing Set ObjConf = Nothing Set ObjMail = Nothing End Function Function GetLinksInPage(ThePageURL, TheTagPattern, TheArrayEndingPattern, TheArrayDocumentsExtension) bLinkFound = False ObjXMLHttp.Open "GET", ThePageURL, False ObjXMLHttp.Send Call GetError() URLStatus = CInt(ObjXMLHttp.Status) If URLStatus = 200 Then If ObjXMLHttp.ReadyState = 4 Then StrPageContent = ObjXMLHttp.ResponseText With ObjRegExp .Pattern = TheTagPattern .IgnoreCase = True .Global = True End With Set ObjRegExpMatch = ObjRegExp.Execute(StrPageContent) If ObjRegExpMatch.Count > 0 Then For Each Item In ObjRegExpMatch iStartingTagPattern = (Item.FirstIndex + Len(TheTagPattern) + 1) Cnt = 1 For i = iStartingTagPattern To Len(StrPageContent) bLinkFound = False For j = 0 To UBound(TheArrayEndingPattern) If LCase(Mid(StrPageContent,i,Len(TheArrayEndingPattern(j)))) = LCase(TheArrayEndingPattern(j)) Then iEndingTagPattern = Cnt - 1 bLinkFound = True Exit For End If Next If bLinkFound Then Exit For Cnt = Cnt + 1 Next TempURL = Mid(StrPageContent,iStartingTagPattern,iEndingTagPattern) TempURL = Replace(TempURL,"""","") If InStrRev(TempURL,"#") > 0 Then TempURL = Left(TempURL,InStrRev(TempURL,"#") -1) End If TempURL = CheckDomainURL(TempURL) If Not SkipURL Then For Each Element In TheArrayDocumentsExtension If Right(TempURL,Len(Element)) = Element Then Call StoreLink(TempURL,URLStatus) End If Next End If Next End If Set ObjRegExpMatch = Nothing End If End If End Function Function BuildPageRank() StrSQL = "SELECT PageURL FROM tblPages" ObjRS.Open StrSQL, ObjConn, 3, 3 Call GetError() If Not ObjRS.EOF Then Do Until ObjRS.EOF PageRank = 0 StrSQLPageRank = "SELECT PageURL FROM tblPages WHERE PageURL != '" & Replace(ObjRS("PageURL"),"'","''") & "'" Set ObjRSPageRank = ObjConn.Execute(StrSQLPageRank) Call GetError() If Not ObjRSPageRank.EOF Then Do Until ObjRSPageRank.EOF TempURL = ObjRSPageRank("PageURL") If Right(TempURL,5) = ".html" Then ObjXMLHttp.Open "GET", SiteURL & TempURL, False ObjXMLHttp.Send Call GetError() URLStatus = CInt(ObjXMLHttp.Status) If URLStatus = 200 Then If ObjXMLHttp.ReadyState = 4 Then StrPageContent = ObjXMLHttp.ResponseText With ObjRegExp .Pattern = ObjRS("PageURL") .IgnoreCase = True .Global = True End With Set ObjRegExpMatch = ObjRegExp.Execute(StrPageContent) PageRank = PageRank + ObjRegExpMatch.Count Set ObjRegExpMatch = Nothing End If End If End If ObjRSPageRank.MoveNext Loop ObjConn.Execute("UPDATE tblPages SET PageRank = " & PageRank & " WHERE PageURL = '" & Replace(ObjRS("PageURL"),"'","''") & "'") Call GetError() End If ObjRSPageRank.Close Set ObjRSPageRank = Nothing ObjRS.MoveNext Loop End If ObjRS.Close End Function Function LocateCodeInPage(ThePageURL, TheTagProperty, TheTagDelimiter, ByRef TheCode) DelimiterFound = False ObjXMLHttp.Open "GET", ThePageURL, False ObjXMLHttp.Send Call GetError() If ObjXMLHttp.ReadyState = 4 Then StrPageContent = ObjXMLHttp.ResponseText With ObjRegExp .Pattern = TheTagProperty .IgnoreCase = True .Global = True End With Set ObjExpMatch = ObjRegExp.Execute(StrPageContent) If ObjExpMatch.Count > 0 Then For Each Item In ObjExpMatch iStartingPos = (Item.FirstIndex + Len(TheTagProperty) + 1) Cnt = 1 DelimiterFound = False For i = iStartingPos to Len(StrPageContent) For j = 0 to UBound(TheTagDelimiter) If LCase(Mid(StrPageContent,i,Len(TheTagDelimiter(j)))) = LCase(TheTagDelimiter(j)) Then iEndingPos = Cnt - 1 DelimiterFound = True Exit For End If Next If DelimiterFound Then Exit For Cnt = Cnt + 1 Next TempCode = Mid(StrPageContent,iStartingPos,iEndingPos) TheCode = TempCode Next End If End If ArrayCode = TempArrayCode End Function Function GetWordDocumentContent(TheDocument) Set ObjWord = CreateObject("Word.Application") Call GetError() ObjWord.Documents.Open TheDocument Call GetError() Set ObjDocContent = ObjWord.ActiveDocument Call GetError() StrDocumentContent = ObjDocContent.Content.Text StrDocumentContent = Replace(StrDocumentContent,vbCrLf," ") PageContent = StrDocumentContent Set ObjDocContent = Nothing Set ObjDocProps = ObjWord.ActiveDocument.BuiltInDocumentProperties PageTitle = ObjDocProps("Title").Value PageKeywords = ObjDocProps("keywords").Value PageDescription = ObjDocProps("Comments").Value Call GetError() Set ObjDocProps = Nothing ObjWord.ActiveDocument.Close wdDoNotSaveChanges ObjWord.Quit Set ObjWord = Nothing End Function Function StoreLink(TheLink, TheStatus) Set TempObjRS = ObjConn.Execute("SELECT PageURL FROM tblURL WHERE PageURL = '" & Replace(TheLink,"'","''") & "'") Call GetError() If TempObjRS.EOF Then ObjConn.Execute("INSERT INTO tblURL (PageURL, PageIndexed, PageStatus) VALUES ('" & Replace(TheLink,"'","''") & "', 0, " & TheStatus & ")") Call GetError() End If TempObjRS.Close Set TempObjRS = Nothing End Function Function IndexPage(TheArrayValidDocExtToIndex) StrSQL = "SELECT DISTINCT(PageURL) FROM tblURL WHERE PageIndexed = 0 AND PageStatus = 200 GROUP BY PageURL" ObjRS.Open StrSQL, ObjConn, 3, 3 Call GetError() If Not ObjRS.EOF Then Do Until ObjRS.EOF For e = 0 To UBound(TheArrayValidDocExtToIndex) If Right(ObjRS("PageURL"),Len(TheArrayValidDocExtToIndex(e))) = TheArrayValidDocExtToIndex(e) Then Call getLinksInPage(SiteURL & ObjRS("PageURL"), "href=", Array(" ",">"), DocExt) End If Next ObjConn.Execute("UPDATE tblURL SET PageIndexed = 1 WHERE PageURL = '" & Replace(ObjRS("PageURL"),"'","''") & "'") Call GetError() ObjRS.MoveNext Loop Else StopSpider = True ObjRS.Close Exit Function End If ObjRS.Close End Function