Improved RSS generation

This commit is contained in:
Yann Esposito (Yogsototh) 2019-09-30 15:10:39 +02:00
parent 29d1945cc0
commit 22a2d2bd6c
Signed by untrusted user who does not match committer: yogsototh
GPG Key ID: 7B19A4C650D59646
6 changed files with 267 additions and 66 deletions

111
mkrss.sh
View File

@ -1,53 +1,58 @@
#!/usr/bin/env nix-shell
#!nix-shell -i zsh
rsstpl="rss.tpl"
# Directory
webdir="_site"
postsdir="$webdir/posts"
rssfile="$webdir/rss.xml"
xmlize() {
local fic="$1";
hxclean $fic
}
# maximal number of articles to put in the RSS file
maxarticles=10
# RSS Metas
rsstitle="her.esy.fun"
rssurl="https://her.esy.fun/rss.xml"
websiteurl="https://her.esy.fun"
rssdescription="her.esy.fun articles, mostly random personal thoughts"
rsslang="en"
rssauthor="yann@esposito.host (Yann Esposito)"
rssimgtitle="yogsototh"
rssimgurl="https://her.esy.fun/img/FlatAvatar.png"
# HTML Accessors (similar to CSS accessors)
dateaccessor='.article-date'
contentaccessor='#content'
# title and keyword shouldn't be changed
titleaccessor='title'
keywordsaccessor='meta[name=keywords]::attr(content)'
# Format a date for the RSS <pubDate>/<lastBuildDate> elements.
# Arguments: $1 - a date string understood by GNU `date --date`
# Outputs:   e.g. "Mon, 30 Sep 2019 15:10:39 +0200" on stdout
# Returns:   the exit status of `date` (non-zero when $1 cannot be parsed)
formatdate() {
    local d="$1"
    # LC_ALL=C forces English day/month abbreviations: it takes precedence
    # over any LC_ALL/LC_TIME inherited from the environment and, unlike
    # en_US, is always available.
    # --date="..." keeps a date containing spaces as a single argument.
    LC_ALL=C date --date="$d" +'%a, %d %b %Y %H:%M:%S %z'
}
finddate(){
local fic="$1"
cat $fic | hxselect -c '.article-date'
}
findtitle(){
local fic="$1"
cat $fic | hxselect -c 'h1'
}
getcontent(){
local fic="$1"
cat $fic | hxselect '#content'
}
findkeywords(){
local fic="$1"
cat $fic | hxselect -c '.keywords > code' | sed 's/,//g'
}
# Thin wrappers around hxselect. Each one reads the file given as $1 and
# prints what the corresponding *accessor selector matches.
# Written in the multi-line form so the definitions parse in bash as well
# as zsh, with every expansion quoted.

# Print the content (-c) of the element selected by $dateaccessor.
finddate() {
    hxselect -c "$dateaccessor" < "$1"
}

# Print the content (-c) of the element selected by $titleaccessor.
findtitle() {
    hxselect -c "$titleaccessor" < "$1"
}

# Print the whole element selected by $contentaccessor (no -c here).
getcontent() {
    hxselect "$contentaccessor" < "$1"
}

# Print what $keywordsaccessor selects, with every comma removed.
findkeywords() {
    hxselect -c "$keywordsaccessor" < "$1" | sed 's/,//g'
}
# Emit one RSS <category> element per keyword.
# Arguments: $@ - keywords, one per argument
# Outputs:   "\n<category>KEYWORD</category>" for each non-empty keyword
mkcategories() {
    # local: do not clobber a caller's $keyword
    local keyword
    for keyword in "$@"; do
        # empty arguments would only produce empty <category> elements
        [ -n "$keyword" ] || continue
        printf '\n<category>%s</category>' "$keyword"
    done
}
realname="Yann Esposito"
website="https://her.esy.fun"
autoload -U colors && colors
tmpdir=$(mktemp -d)
typeset -a dates
dates=( )
for fic in $postsdir/**/*.html; do
printf "%-30s" $(echo "$fic"|sed 's#^'$postsdir'/##')
blogfile="$(echo "$fic"|sed 's#^'$postsdir'/##')"
printf "%-30s" $blogfile
xfic="$tmpdir/$fic.xml"
mkdir -p $(dirname $xfic)
xmlize $fic > $xfic
hxclean $fic > $xfic
d=$(finddate $xfic)
echo -n " [$d]"
rssdate=$(formatdate $d)
@ -55,15 +60,59 @@ for fic in $postsdir/**/*.html; do
keywords=( $(findkeywords $xfic) )
printf ": %-55s" "$title ($keywords)"
categories=$(mkcategories $keywords)
blogfile="$(echo $fic | perl -pe 's#.*?/posts/#/posts/#')"
printf "\\n<item>\\n<title>%s</title>\\n<guid>%s%s</guid>\\n<pubDate>%s</pubDate>%s\\n<description><![CDATA[\\n%s\\n]]></description>\\n</item>\\n\\n" "$title" "$website" "$blogfile" "$rssdate" "$categories" "$(getcontent "$xfic")" >> "$tmpdir/${d}-$(basename $fic).rss"
{ printf "\\n<item>"
printf "\\n<title>%s</title>" "$title"
printf "\\n<guid>%s</guid>" "${websiteurl}/${blogfile}"
printf "\\n<pubDate>%s</pubDate>%s" "$rssdate"
printf "%s" "$categories"
printf "\\n<description><![CDATA[\\n%s\\n]]></description>" "$(getcontent "$xfic")"
printf "\\n</item>\\n\\n"
} >> "$tmpdir/${d}-$(basename $fic).rss"
dates=( $d $dates )
echo " [${fg[green]}OK${reset_color}]"
done
for fic in $(ls $tmpdir/*.rss | sort -r); do
# echo $fic
echo "Publishing"
for fic in $(ls $tmpdir/*.rss | sort -r | head -n $maxarticles ); do
echo "${fic:t}"
cat $fic >> $tmpdir/rss
done
sed "/<!-- LB -->/r $tmpdir/rss" "$rsstpl" > "$rssfile"
rssmaxdate=$(formatdate $(for d in $dates; do echo $d; done | sort -r | head -n 1))
rssbuilddate=$(formatdate $(date))
{
cat <<END
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
xmlns:georss="http://www.georss.org/georss"
xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
xmlns:media="http://search.yahoo.com/mrss/"><channel>
<title>${rsstitle}</title>
<atom:link href="${rssurl}" rel="self" type="application/rss+xml" />
<link>${websiteurl}</link>
<description><![CDATA[${rssdescription}]]></description>
<language>${rsslang}</language>
<pubDate>${rssmaxdate}</pubDate>
<lastBuildDate>$rssbuilddate</lastBuildDate>
<generator>mkrss.sh</generator>
<webMaster>${rssauthor}</webMaster>
<image>
<url>${rssimgurl}</url>
<title>${rssimgtitle}</title>
<link>${websiteurl}</link>
</image>
END
cat $tmpdir/rss
cat <<END
</channel>
</rss>
END
} > "$rssfile"
rm -rf $tmpdir
echo "RSS Generated"

View File

@ -227,7 +227,7 @@ Return output file name."
(setq org-publish-project-alist
`(("orgfiles"
:base-directory ,base-dir
:exclude ".*drafts/.*\\|.*/rss.*"
:exclude ".*drafts/.*"
:base-extension "org"
:publishing-directory ,publish-dir
:recursive t

Binary file not shown.

30
rss.tpl
View File

@ -1,30 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
xmlns:georss="http://www.georss.org/georss"
xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
xmlns:media="http://search.yahoo.com/mrss/"><channel>
<title>her.esy.fun</title>
<atom:link href="https://her.esy.fun/rss.xml" rel="self" type="application/rss+xml" />
<link>https://her.esy.fun</link>
<description><![CDATA[her.esy.fun articles, mostly random personal thoughts]]></description>
<language>en</language>
<pubDate>Mon, 23 Sep 2019 09:59:16 +0200</pubDate>
<lastBuildDate>Mon, 23 Sep 2019 09:59:16 +0200</lastBuildDate>
<generator>Emacs 26.3 Org-mode 9.2.5</generator>
<webMaster>yann@esposito.host (Yann Esposito)</webMaster>
<image>
<url>https://her.esy.fun/img/FlatAvatar.png</url>
<title>her.esy.fun</title>
<link>https://her.esy.fun</link>
</image>
<!-- LB -->
</channel>
</rss>

View File

@ -2,7 +2,8 @@
#+AUTHOR: Yann Esposito
#+EMAIL: yann@esposito.host
#+DESCRIPTION: Articles
- [2019-09-23] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/how-i-internet.org][How I Internet]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#minimalism@@html:</span>@@ @@html:<span class="keyword">@@#self-hosting@@html:</span>@@ @@html:<span class="keyword">@@#web@@html:</span>@@ @@html:<span class="keyword">@@#zen@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@How I protect myself against attention grabbers and many social media anti-patterns.@@html:</div>@@
- [2019-08-18] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/project-el/index.org][Autoload Script by project]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#org-mode@@html:</span>@@ @@html:<span class="keyword">@@#programming@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@A script I use to load safely an eLISP file when entering a new project directory.@@html:</div>@@
- [2019-08-17] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/troll-2/index.org][Troll 2]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#movie@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@I watched what may be the worse movie of all time and I still enjoyed greatly the show.@@html:</div>@@
- [2019-08-17] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/new-blog.org][New Blog]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#css@@html:</span>@@ @@html:<span class="keyword">@@#org-mode@@html:</span>@@ @@html:<span class="keyword">@@#programming@@html:</span>@@ @@html:<span class="keyword">@@#web@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@Meta article about how I generate this blog.@@html:</div>@@
- [2019-09-30] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/rss-gen.org][RSS Generation]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#programming@@html:</span>@@ @@html:<span class="keyword">@@#web@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@How I generate RSS feed via command line@@html:</div>@@
- [2019-09-23] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/how-i-internet.org][How I Internet]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#minimalism@@html:</span>@@ @@html:<span class="keyword">@@#self-hosting@@html:</span>@@ @@html:<span class="keyword">@@#web@@html:</span>@@ @@html:<span class="keyword">@@#zen@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@How I protect myself against attention grabbers and many social media anti-patterns.@@html:</div>@@
- [2019-08-18] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/project-el/index.org][Autoload Script by project]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#org-mode@@html:</span>@@ @@html:<span class="keyword">@@#programming@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@A script I use to load safely an eLISP file when entering a new project directory.@@html:</div>@@
- [2019-08-17] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/troll-2/index.org][Troll 2]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#movie@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@I watched what may be the worse movie of all time and I still enjoyed greatly the show.@@html:</div>@@
- [2019-08-17] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/new-blog.org][New Blog]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#css@@html:</span>@@ @@html:<span class="keyword">@@#org-mode@@html:</span>@@ @@html:<span class="keyword">@@#programming@@html:</span>@@ @@html:<span class="keyword">@@#web@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@Meta article about how I generate this blog.@@html:</div>@@

181
src/posts/rss-gen.org Normal file
View File

@ -0,0 +1,181 @@
#+TITLE: RSS Generation
#+SUBTITLE: How to generate RSS feed via command line
#+AUTHOR: Yann Esposito
#+EMAIL: yann@esposito.host
#+DATE: [2019-09-30 Mon]
#+KEYWORDS: programming, web
#+DESCRIPTION: How I generate RSS feed via command line
#+OPTIONS: auto-id:t
#+begin_notes
TL;DR: To generate an RSS file you need to provide a lot of metadata.
This metadata is not part of every HTML file.
So generating RSS from a tree of HTML files is not straightforward.
Here is the script I use.
#+end_notes
* RSS Problem
:PROPERTIES:
:CUSTOM_ID: rss-problem
:END:
RSS feed is meant to declare updates and new articles for a website.
Each RSS entry must therefore have a date, a unique id, a title, maybe
some categories, etc...
For most blog platforms, or even static website generators, this meta
information is clearly put in the sources or in some DB.
I use =org-mode= to generate my website, and =ox-rss= is quite slow
when generating an RSS feed with the full content of each item.
Mainly, the way to get the full content of my articles into an RSS feed with
=ox-rss= is to first create a very big org file containing all the
articles, and then transform it into RSS. And this is very slow (many minutes).
So a simpler idea inspired by lb[fn:lb] is to generate the RSS directly
from the generated HTML files.
The only difficulty is to find the metadata inside those HTML.
Unfortunately there is no real standard for all those metas.
As there is no standard place for all this meta information inside
an HTML file, you'll need to "parse" the HTML file in order to use it as a
source.
For that purpose I use =html-xml-utils=.
Here is the full script I use:
#+begin_src bash
#!/usr/bin/env nix-shell
#!nix-shell -i zsh
# Directories
# webdir is the root of the generated site; posts are looked up recursively
# under postsdir, and the feed is written to rssfile.
webdir="_site"
postsdir="$webdir/posts"
rssfile="$webdir/rss.xml"
# maximal number of articles to put in the RSS file
# (only the newest ones are kept when publishing)
maxarticles=10
# RSS Metas: values interpolated into the <channel> header of the feed
rsstitle="her.esy.fun"
rssurl="https://her.esy.fun/rss.xml"
websiteurl="https://her.esy.fun"
rssdescription="her.esy.fun articles, mostly random personal thoughts"
rsslang="en"
rssauthor="yann@esposito.host (Yann Esposito)"
rssimgtitle="yogsototh"
rssimgurl="https://her.esy.fun/img/FlatAvatar.png"
# HTML Accessors: CSS-like selectors handed to hxselect to pull the
# metadata out of each post
dateaccessor='.article-date'
contentaccessor='#content'
# title and keyword shouldn't be changed
titleaccessor='title'
keywordsaccessor='meta[name=keywords]::attr(content)'
# Format a date for the RSS <pubDate>/<lastBuildDate> elements.
# Arguments: $1 - a date string understood by GNU `date --date`
# Outputs:   e.g. "Mon, 30 Sep 2019 15:10:39 +0200" on stdout
# Returns:   the exit status of `date` (non-zero when $1 cannot be parsed)
formatdate() {
    local d="$1"
    # LC_ALL=C forces English day/month abbreviations: it takes precedence
    # over any LC_ALL/LC_TIME inherited from the environment and, unlike
    # en_US, is always available.
    # --date="..." keeps a date containing spaces as a single argument.
    LC_ALL=C date --date="$d" +'%a, %d %b %Y %H:%M:%S %z'
}
# Thin wrappers around hxselect. Each one reads the file given as $1 and
# prints what the corresponding *accessor selector matches.
# Written in the multi-line form so the definitions parse in bash as well
# as zsh, with every expansion quoted.

# Print the content (-c) of the element selected by $dateaccessor.
finddate() {
    hxselect -c "$dateaccessor" < "$1"
}

# Print the content (-c) of the element selected by $titleaccessor.
findtitle() {
    hxselect -c "$titleaccessor" < "$1"
}

# Print the whole element selected by $contentaccessor (no -c here).
getcontent() {
    hxselect "$contentaccessor" < "$1"
}

# Print what $keywordsaccessor selects, with every comma removed.
findkeywords() {
    hxselect -c "$keywordsaccessor" < "$1" | sed 's/,//g'
}
# Emit one RSS <category> element per keyword.
# Arguments: $@ - keywords, one per argument
# Outputs:   "\n<category>KEYWORD</category>" for each non-empty keyword
mkcategories() {
    # local: do not clobber a caller's $keyword
    local keyword
    for keyword in "$@"; do
        # empty arguments would only produce empty <category> elements
        [ -n "$keyword" ] || continue
        printf '\n<category>%s</category>' "$keyword"
    done
}
autoload -U colors && colors
tmpdir=$(mktemp -d)
typeset -a dates
dates=( )
for fic in $postsdir/**/*.html; do
blogfile="$(echo "$fic"|sed 's#^'$postsdir'/##')"
printf "%-30s" $blogfile
xfic="$tmpdir/$fic.xml"
mkdir -p $(dirname $xfic)
hxclean $fic > $xfic
d=$(finddate $xfic)
echo -n " [$d]"
rssdate=$(formatdate $d)
title=$(findtitle $xfic)
keywords=( $(findkeywords $xfic) )
printf ": %-55s" "$title ($keywords)"
categories=$(mkcategories $keywords)
{ printf "\\n<item>"
printf "\\n<title>%s</title>" "$title"
printf "\\n<guid>%s</guid>" "${websiteurl}/${blogfile}"
printf "\\n<pubDate>%s</pubDate>%s" "$rssdate"
printf "%s" "$categories"
printf "\\n<description><![CDATA[\\n%s\\n]]></description>" "$(getcontent "$xfic")"
printf "\\n</item>\\n\\n"
} >> "$tmpdir/${d}-$(basename $fic).rss"
dates=( $d $dates )
echo " [${fg[green]}OK${reset_color}]"
done
echo "Publishing"
for fic in $(ls $tmpdir/*.rss | sort -r | head -n $maxarticles ); do
echo "${fic:t}"
cat $fic >> $tmpdir/rss
done
rssmaxdate=$(formatdate $(for d in $dates; do echo $d; done | sort -r | head -n 1))
rssbuilddate=$(formatdate $(date))
{
cat <<END
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
xmlns:georss="http://www.georss.org/georss"
xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
xmlns:media="http://search.yahoo.com/mrss/"><channel>
<title>${rsstitle}</title>
<atom:link href="${rssurl}" rel="self" type="application/rss+xml" />
<link>${websiteurl}</link>
<description><![CDATA[${rssdescription}]]></description>
<language>${rsslang}</language>
<pubDate>${rssmaxdate}</pubDate>
<lastBuildDate>$rssbuilddate</lastBuildDate>
<generator>mkrss.sh</generator>
<webMaster>${rssauthor}</webMaster>
<image>
<url>${rssimgurl}</url>
<title>${rssimgtitle}</title>
<link>${websiteurl}</link>
</image>
END
cat $tmpdir/rss
cat <<END
</channel>
</rss>
END
} > "$rssfile"
rm -rf $tmpdir
echo "RSS Generated"
#+end_src
The =nix-shell= bang pattern is a neat trick to have all the dependencies I
need when running my script. I could have added zsh, but my main concern
was about =html-xml-utils=.
Alongside my script I have a =shell.nix= file containing:
#+begin_src nix
{ pkgs ? import (fetchTarball https://github.com/NixOS/nixpkgs/archive/19.09-beta.tar.gz) {} }:
pkgs.mkShell {
buildInputs = [ pkgs.html-xml-utils ];
}
#+end_src
[fn:lb] https://github.com/LukeSmithxyz/lb