Come punto di partenza potresti usare un codice come questo:
Codice PHP:
<pre>
<?php
/**
 * Tells whether a value counts as "non-empty" for filtering purposes.
 *
 * Strings are compared against '' explicitly because empty() treats the
 * string '0' as empty, which would drop a legitimate text node such as "0".
 * Non-string values keep PHP's usual empty() semantics.
 *
 * @param mixed $content Value to test.
 * @return bool True when the value should be kept.
 */
function not_empty($content) {
    if (is_string($content)) {
        return $content !== '';
    }
    return !empty($content);
}
// Downloads a web page, strips script/style/comment markup and prints the
// non-empty text fragments found between tags.
$url = 'http://www.masteringlandingpages.com/'; // your web page url/path

// file_get_contents() returns false on failure; stop instead of parsing nothing.
$content = file_get_contents($url);
if ($content === false) {
    throw new RuntimeException('Unable to read ' . $url);
}

// Just an example of content preparation: convert UTF-8 to ISO-8859-1.
// utf8_decode() is deprecated since PHP 8.2, so mbstring is preferred when loaded.
$content = function_exists('mb_convert_encoding')
    ? mb_convert_encoding($content, 'ISO-8859-1', 'UTF-8')
    : utf8_decode($content);

// clean up "strange" tags
$content = preg_replace('#<script[^>]*>.*?</script[^>]*>#si', '', $content);
$content = preg_replace('#<style[^>]*>.*?</style[^>]*>#si', '', $content);
// The original pattern here was empty (a no-op); stripping HTML comments is
// the presumed intent, so their text does not leak into the results.
$content = preg_replace('#<!--.*?-->#s', '', $content);

$results = array();
// Fake wrapper tags so that text before the first tag and after the last one
// is also enclosed between '>' and '<' for the regexp below.
$content = '<root>' . $content . '</root>';
$pattern = '#>([^<]*)<#si';
if (preg_match_all($pattern, $content, $matches)) {
    foreach ($matches[1] as $match) {
        // Just examples of normalization tasks. The original search string was
        // empty (a no-op); '&nbsp;' is the presumed entity meant to become a space.
        $match = str_replace('&nbsp;', ' ', $match);
        $match = trim($match);
        $results[] = $match;
    }
}
$results = array_filter($results, 'not_empty'); // remove empty text fragments
print_r($results);
?>
</pre>