script to convert <article> elements to <item> elements

AMA3
Bear Rating Trainee
Bear Rating Trainee
Posts: 5
Joined: 11 Apr 2013, 17:49

script to convert <article> elements to <item> elements

Postby AMA3 » 19 Apr 2013, 06:43

I was having problems with some feeds being invalid XML (notably feeds for blog.jquery.com and modernizr.com/news) and decided that I would work around the issue by writing a PHP script that scans any HTML page and converts it to a very basic RSS feed, where every <article> element on the page becomes an <item> in the RSS.

You should be able to use this script to convert any HTML page that uses <article>. It's not terribly robust (at least not right now), but I might make it better in the future.

It's easy to install (one step!) and to use (one step!).

1. Installation:

Save the following code in a file named articles-to-rss.php (or whatever), perhaps in the root folder of your tt-rss installation:

Code: Select all

<?php
// Keep PHP diagnostics out of the response body.
ini_set( 'display_errors', '0' );

// When DOCUMENT_ROOT is not set, derive it from SCRIPT_FILENAME by cutting off
// a tail as long as PHP_SELF and converting backslashes to forward slashes.
if ( ! isset( $_SERVER['DOCUMENT_ROOT'] ) )
{
  $scriptFile = $_SERVER['SCRIPT_FILENAME'];
  $selfLength = strlen( $_SERVER['PHP_SELF'] );
  $rootPath = substr( $scriptFile, 0, 0 - $selfLength );
  $_SERVER['DOCUMENT_ROOT'] = str_replace( '\\', '/', $rootPath );
}

//===========================================================================

/**
 * Finds a title for a DOM node by scanning its direct children in document order.
 *
 * The first child that is a heading (h1..h6) supplies the title through its
 * text content. If a <header> child is met first, that header is searched the
 * same way; when it holds no heading, the header's whole text content is used.
 *
 * @param DOMNode $e node whose children are scanned (e.g. an <article> element)
 * @return string|null the title text, or null if no heading or header child exists
 */
function findTitle( $e )
{
  $headingTags = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );

  $child = $e->firstChild;
  while ( $child != null )
  {
    $tag = strtolower( $child->nodeName );

    if ( in_array( $tag, $headingTags, true ) )
      return $child->textContent;

    if ( $tag === 'header' )
    {
      // Prefer a heading nested in the header; otherwise use the header text.
      $nested = findTitle( $child );
      return ( $nested !== null ) ? $nested : $child->textContent;
    }

    $child = $child->nextSibling;
  }

  return null;
}

//===========================================================================

/**
 * Reduces a phrase to its lowercase ASCII letters so that titles and URLs can
 * be compared loosely.
 *
 * Steps: lowercase the input, drop every literal "&amp;", remove everything
 * that is not a-z or a space, then remove the spaces as well.
 *
 * @param string $phrase text to normalise (a title or an href)
 * @return string the phrase with only the characters a-z remaining
 */
function stripSpecial( $phrase )
{
  $lowered = strtolower( $phrase );
  $withoutAmp = str_replace( '&amp;', '', $lowered );
  $lettersAndSpaces = preg_replace( '/[^a-z ]/', '', $withoutAmp );

  return str_replace( ' ', '', $lettersAndSpaces );
}


//===========================================================================

// Request handler: fetches the page named by the 'url' parameter and emits an
// RSS 2.0 feed in which every <article> element of the page becomes an <item>.
//
// Parameters (read from $_REQUEST):
//   url - required; the page address WITHOUT its scheme (e.g. blog.jquery.com)
//   tls - optional; when present (even empty, as in "tls="), the page is
//         fetched over https:// instead of http://. "tls=0" keeps http://.
//
// NOTE(review): this script fetches whatever host the caller names, so it can
// be used to reach hosts visible only to this server. Restrict access to it if
// it is deployed on a public server.
if ( ! isset( $_REQUEST['url'] ) || ! is_string( $_REQUEST['url'] ) )
{
  echo "missing required parameter 'url'\r\n";
}
else // have url
{
  // The parameter's presence selects HTTPS, not its truthiness: an empty
  // "tls=" is a falsy string and would otherwise select HTTP.
  $useTls = isset( $_REQUEST['tls'] ) && ( $_REQUEST['tls'] !== '0' );
  $url = ( $useTls ? 'https://' : 'http://' ) . rawurldecode( $_REQUEST['url'] );
  $urlSafe = htmlspecialchars( $url );

  set_time_limit(120);
  header('Content-type: application/rss+xml');

  echo '<rss version="2.0">' . "\r\n";
  echo '<channel>' . "\r\n";

  $context = stream_context_create( array( 'http' => array( 'timeout' => 20 )));

  // file_get_contents() returns false on failure; treat that like an empty body.
  $results = file_get_contents( $url, FALSE, $context );
  if ( ( $results === false ) || ( strlen( $results ) < 1 ) )
  {
    echo "<title>missing data from $urlSafe</title>\r\n";
    echo "<link>$urlSafe</link>\r\n";
    echo "<description>$urlSafe did not return any data</description>\r\n";
  }
  else // got something
  {
    // libxml reports HTML5 tags such as <article> as errors; collect them
    // internally instead of raising a PHP warning for each one.
    $previousLibxml = libxml_use_internal_errors( true );
    $doc = new DOMDocument();
    $doc->loadHTML( $results );
    libxml_clear_errors();
    libxml_use_internal_errors( $previousLibxml );

    // Channel title: the page's <head><title>, falling back to the URL.
    $channelTitle = $url;
    $heads = $doc->getElementsByTagName('head');
    if ( $heads->length > 0 )
    {
      $titles = $heads->item(0)->getElementsByTagName('title');
      if ( $titles->length > 0 )
        $channelTitle = $titles->item(0)->textContent;
    }

    $titleSafe = htmlspecialchars( $channelTitle );
    echo "<title>$titleSafe</title>\r\n";
    echo "<link>$urlSafe</link>\r\n";
    echo "<description>Articles from $titleSafe ($urlSafe) converted to RSS.</description>\r\n";

    $articles = $doc->getElementsByTagName('article');
    // Links are looked up across the whole document, so the anchor list does
    // not depend on the current article; fetch it once.
    $anchors = $doc->getElementsByTagName('a');

    for ( $a = 0 ; $a < $articles->length ; $a++ )
    {
      $art = $articles->item($a);

      $found = findTitle( $art );
      $title = ( $found === null ) ? '' : trim( $found );
      if ( $title === '' )
        $title = '[title unknown]';
      $titleStripped = stripSpecial( $title );

      // Permalink guess: the first anchor whose href, reduced to letters,
      // contains the title reduced to letters. Skipped when the reduced title
      // is empty, because an empty needle would match any href.
      $guid = null;
      if ( $titleStripped !== '' )
      {
        for ( $h = 0 ; $h < $anchors->length ; $h++ )
        {
          $href = $anchors->item($h)->getAttribute('href');
          if ( ( $href !== '' ) && ( strpos( stripSpecial( $href ), $titleStripped ) !== false ) )
          {
            $guid = $href;
            break; // for ( $h ... )
          }
        }
      }

      // Without a matching anchor, link to the page itself, pointing at the
      // article's id when it has one.
      if ( $guid !== null )
      {
        $link = $guid;
      }
      else
      {
        $link = $url;
        $id = $art->getAttribute('id');
        if ( $id !== '' )
          $link .= "#$id";
      }

      // Every value is escaped before being written into the XML.
      echo "<item>\r\n<title>" . htmlspecialchars( $title ) . "</title>\r\n";
      echo "<link>" . htmlspecialchars( $link ) . "</link>\r\n";
      if ( $guid !== null )
        echo '<guid isPermaLink="true">' . htmlspecialchars( $guid ) . "</guid>\r\n";
      echo "<description>" . htmlspecialchars( $doc->saveHTML( $art ) ) . "</description>\r\n</item>\r\n";

    } // for ( $a ... )

  } // else // got something

  echo "</channel>\r\n</rss>\r\n";

} // else // have url
?>


2. Subscribe

For any page containing articles that you want to subscribe to like an RSS feed, use TT-RSS (or any other feed reader, for that matter) to subscribe to the URL:

Code: Select all

http://your-server/articles-to-rss.php?url=url_without_scheme


For example, if your server name is example.com and you want to subscribe to http://blog.jquery.com, you should subscribe to this URL:

Code: Select all

http://example.com/articles-to-rss.php?url=blog.jquery.com


If the original page uses HTTPS instead of HTTP, add a tls parameter with a non-empty value (for example tls=1), such as:

Code: Select all

http://example.com/articles-to-rss.php?tls=1&url=blog.jquery.com


That's it! I hope that I won't be the only person to find this useful :)

Return to “Everything else”

Who is online

Users browsing this forum: No registered users and 1 guest