From 526a8f9db07e94ace016150a6439b9b1db441456 Mon Sep 17 00:00:00 2001 From: "Yann Esposito (Yogsototh)" Date: Mon, 30 Sep 2019 15:54:43 +0200 Subject: [PATCH] Added RSS gen article --- src/posts/rss-gen.org | 167 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 165 insertions(+), 2 deletions(-) diff --git a/src/posts/rss-gen.org b/src/posts/rss-gen.org index 5927aa4..006828e 100644 --- a/src/posts/rss-gen.org +++ b/src/posts/rss-gen.org @@ -40,9 +40,167 @@ Unfortunately there is no real standard for all those metas. Has there is no standard place to have all those meta informations inside an HTML file in order to use the HTML as source you'll need to "parse" the HTML file. -For that purpose I use =html-xml-utils=. +For that purpose I use =html-xml-utils=[fn:hu]. -Here is the full script I use +I wrote a simple zsh script; it starts with a lot of variables to fill: + + +#+begin_src bash +# Directory +webdir="_site" # directory containing your website html files +postsdir="$webdir/posts" # directory containing the articles +rssfile="$webdir/rss.xml" # the RSS file to generate + +# maximal number of articles to put in the RSS file +maxarticles=10 + +# RSS Metas +rsstitle="her.esy.fun" +rssurl="https://her.esy.fun/rss.xml" +websiteurl="https://her.esy.fun" +rssdescription="her.esy.fun articles, mostly random personal thoughts" +rsslang="en" +rssauthor="yann@esposito.host (Yann Esposito)" +rssimgtitle="yogsototh" +rssimgurl="https://her.esy.fun/img/FlatAvatar.png" +#+end_src + +Then I set the accessors to extract the information I want from HTML files. +It is quite unfortunate that there is no really strong convention for where +to put article dates or the article author's email. +There are metas for title and keywords, though. 
+ +#+begin_src bash +# HTML Accessors (similar to CSS accessors) +dateaccessor='.article-date' +contentaccessor='#content' +# title and keyword shouldn't be changed +titleaccessor='title' +keywordsaccessor='meta[name=keywords]::attr(content)' +#+end_src + +A few helper functions: + +#+begin_src bash +formatdate() { + # format the date for RSS + local d=$1 + LC_TIME=en_US date --date $d +'%a, %d %b %Y %H:%M:%S %z' +} + +finddate(){ < $1 hxselect -c $dateaccessor } +findtitle(){ < $1 hxselect -c $titleaccessor } +getcontent(){ < $1 hxselect $contentaccessor } +findkeywords(){ < $1 hxselect -c $keywordsaccessor | sed 's/,//g' } +mkcategories(){ + for keyword in $*; do + printf "\\n%s" $keyword + done +} +#+end_src + +The =mkcategories= will be used to add an RSS category for each keyword. +And finally the real loop doing the work: + +#+begin_src bash +tmpdir=$(mktemp -d) # create a temporary work dir +typeset -a dates # an array to save dates of all articles +dates=( ) + +# for each HTML file we generate the XML for the item in a file +# named ${d}-$(basename $fic).rss that naming convention will be useful to +# sort article by date +for fic in $postsdir/**/*.html; do + blogfile="$(echo "$fic"|sed 's#^'$postsdir'/##')" + printf "%-30s" $blogfile + xfic="$tmpdir/$fic.xml" + mkdir -p $(dirname $xfic) + hxclean $fic > $xfic # create a cleaner HTML file to help hxselect work + d=$(finddate $xfic) + echo -n " [$d]" + rssdate=$(formatdate $d) + title=$(findtitle $xfic) + keywords=( $(findkeywords $xfic) ) + printf ": %-55s" "$title ($keywords)" + # up until here, we extracted the informations we need for the item + categories=$(mkcategories $keywords) + { printf "\\n" + printf "\\n%s" "$title" + printf "\\n%s" "${websiteurl}/${blogfile}" + printf "\\n%s%s" "$rssdate" + printf "%s" "$categories" + printf "\\n" "$(getcontent "$xfic")" + printf "\\n\\n\\n" + } >> "$tmpdir/${d}-$(basename $fic).rss" + # we append the date to the list of dates + dates=( $d $dates ) + echo " 
[${fg[green]}OK${reset_color}]" +done + +# Now we publish the items in reverse order, newer articles first +echo "Publishing" +for fic in $(ls $tmpdir/*.rss | sort -r | head -n $maxarticles ); do + echo "${fic:t}" + cat $fic >> $tmpdir/rss +done + +# we get the latest publish date +rssmaxdate=$(formatdate $(for d in $dates; do echo $d; done | sort -r | head -n 1)) +# we put the current date for the latest build date +rssbuilddate=$(formatdate $(date)) + +# we generate the RSS file +{ +# Write the preamble of the RSS file +cat < + + ${rsstitle} + + ${websiteurl} + + ${rsslang} + ${rssmaxdate} + $rssbuilddate + mkrss.sh + ${rssauthor} + + ${rssimgurl} + ${rssimgtitle} + ${websiteurl} + +END + +# write all items +cat $tmpdir/rss + +# close the RSS file +cat < + +END +} > "$rssfile" + +# cleanup temporary directory +rm -rf $tmpdir +echo "RSS Generated" +#+end_src + +** Full script +:PROPERTIES: +:CUSTOM_ID: full-script +:END: + +Here is the full script I use: #+begin_src bash #!/usr/bin/env nix-shell @@ -178,4 +336,9 @@ Along my script I have a =shell.nix= file containing: } #+end_src +If you are not already using nix[fn:nix], you should really take a look. +That =shell.nix= will work on Linux and macOS. + [fn:lb] https://github.com/LukeSmithxyz/lb +[fn:hu] https://www.w3.org/Tools/HTML-XML-utils/ +[fn:nix] https://nixos.org/nix