diff --git a/mkrss.sh b/mkrss.sh index 6c11b90..b18e822 100755 --- a/mkrss.sh +++ b/mkrss.sh @@ -1,53 +1,58 @@ #!/usr/bin/env nix-shell #!nix-shell -i zsh -rsstpl="rss.tpl" +# Directory webdir="_site" postsdir="$webdir/posts" rssfile="$webdir/rss.xml" -xmlize() { - local fic="$1"; - hxclean $fic -} +# maximal number of articles to put in the RSS file +maxarticles=10 + +# RSS Metas +rsstitle="her.esy.fun" +rssurl="https://her.esy.fun/rss.xml" +websiteurl="https://her.esy.fun" +rssdescription="her.esy.fun articles, mostly random personal thoughts" +rsslang="en" +rssauthor="yann@esposito.host (Yann Esposito)" +rssimgtitle="yogsototh" +rssimgurl="https://her.esy.fun/img/FlatAvatar.png" + +# HTML Accessors (similar to CSS accessors) +dateaccessor='.article-date' +contentaccessor='#content' +# title and keyword shouldn't be changed +titleaccessor='title' +keywordsaccessor='meta[name=keywords]::attr(content)' formatdate() { + # format the date for RSS local d=$1 LC_TIME=en_US date --date $d +'%a, %d %b %Y %H:%M:%S %z' } -finddate(){ - local fic="$1" - cat $fic | hxselect -c '.article-date' -} -findtitle(){ - local fic="$1" - cat $fic | hxselect -c 'h1' -} -getcontent(){ - local fic="$1" - cat $fic | hxselect '#content' -} -findkeywords(){ - local fic="$1" - cat $fic | hxselect -c '.keywords > code' | sed 's/,//g' -} + +finddate(){ < $1 hxselect -c $dateaccessor } +findtitle(){ < $1 hxselect -c $titleaccessor } +getcontent(){ < $1 hxselect $contentaccessor } +findkeywords(){ < $1 hxselect -c $keywordsaccessor | sed 's/,//g' } mkcategories(){ for keyword in $*; do printf "\\n%s" $keyword done } -realname="Yann Esposito" -website="https://her.esy.fun" - autoload -U colors && colors tmpdir=$(mktemp -d) +typeset -a dates +dates=( ) for fic in $postsdir/**/*.html; do - printf "%-30s" $(echo "$fic"|sed 's#^'$postsdir'/##') + blogfile="$(echo "$fic"|sed 's#^'$postsdir'/##')" + printf "%-30s" $blogfile xfic="$tmpdir/$fic.xml" mkdir -p $(dirname $xfic) - xmlize $fic > $xfic + 
hxclean $fic > $xfic d=$(finddate $xfic) echo -n " [$d]" rssdate=$(formatdate $d) @@ -55,15 +60,59 @@ for fic in $postsdir/**/*.html; do keywords=( $(findkeywords $xfic) ) printf ": %-55s" "$title ($keywords)" categories=$(mkcategories $keywords) - blogfile="$(echo $fic | perl -pe 's#.*?/posts/#/posts/#')" - printf "\\n\\n%s\\n%s%s\\n%s%s\\n\\n\\n\\n" "$title" "$website" "$blogfile" "$rssdate" "$categories" "$(getcontent "$xfic")" >> "$tmpdir/${d}-$(basename $fic).rss" + { printf "\\n" + printf "\\n%s" "$title" + printf "\\n%s" "${websiteurl}/${blogfile}" + printf "\\n%s%s" "$rssdate" + printf "%s" "$categories" + printf "\\n" "$(getcontent "$xfic")" + printf "\\n\\n\\n" + } >> "$tmpdir/${d}-$(basename $fic).rss" + dates=( $d $dates ) echo " [${fg[green]}OK${reset_color}]" done -for fic in $(ls $tmpdir/*.rss | sort -r); do - # echo $fic +echo "Publishing" +for fic in $(ls $tmpdir/*.rss | sort -r | head -n $maxarticles ); do + echo "${fic:t}" cat $fic >> $tmpdir/rss done -sed "//r $tmpdir/rss" "$rsstpl" > "$rssfile" +rssmaxdate=$(formatdate $(for d in $dates; do echo $d; done | sort -r | head -n 1)) +rssbuilddate=$(formatdate $(date)) +{ +cat < + + ${rsstitle} + + ${websiteurl} + + ${rsslang} + ${rssmaxdate} + $rssbuilddate + mkrss.sh + ${rssauthor} + + ${rssimgurl} + ${rssimgtitle} + ${websiteurl} + +END +cat $tmpdir/rss +cat < + +END +} > "$rssfile" + rm -rf $tmpdir echo "RSS Generated" diff --git a/project.el b/project.el index a453782..b9da87d 100644 --- a/project.el +++ b/project.el @@ -227,7 +227,7 @@ Return output file name." 
(setq org-publish-project-alist `(("orgfiles" :base-directory ,base-dir - :exclude ".*drafts/.*\\|.*/rss.*" + :exclude ".*drafts/.*" :base-extension "org" :publishing-directory ,publish-dir :recursive t diff --git a/project.el.sig b/project.el.sig index 6034941..2fd27c4 100644 Binary files a/project.el.sig and b/project.el.sig differ diff --git a/rss.tpl b/rss.tpl deleted file mode 100644 index fe03740..0000000 --- a/rss.tpl +++ /dev/null @@ -1,30 +0,0 @@ - - - her.esy.fun - - https://her.esy.fun - - en - Mon, 23 Sep 2019 09:59:16 +0200 - Mon, 23 Sep 2019 09:59:16 +0200 - Emacs 26.3 Org-mode 9.2.5 - yann@esposito.host (Yann Esposito) - - https://her.esy.fun/img/FlatAvatar.png - her.esy.fun - https://her.esy.fun - - - - - - diff --git a/src/archive.org b/src/archive.org index d6d70e3..dab7cac 100644 --- a/src/archive.org +++ b/src/archive.org @@ -2,7 +2,8 @@ #+AUTHOR: Yann Esposito #+EMAIL: yann@esposito.host #+DESCRIPTION: Articles -- [2019-09-23] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/how-i-internet.org][How I Internet]]* @@html:
@@@@html:@@#blog@@html:@@ @@html:@@#minimalism@@html:@@ @@html:@@#self-hosting@@html:@@ @@html:@@#web@@html:@@ @@html:@@#zen@@html:@@@@html:
@@@@html:
@@How I protect myself against attention grabbers and many social media anti-patterns.@@html:
@@ -- [2019-08-18] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/project-el/index.org][Autoload Script by project]]* @@html:
@@@@html:@@#blog@@html:@@ @@html:@@#org-mode@@html:@@ @@html:@@#programming@@html:@@@@html:
@@@@html:
@@A script I use to load safely an eLISP file when entering a new project directory.@@html:
@@ -- [2019-08-17] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/troll-2/index.org][Troll 2]]* @@html:
@@@@html:@@#movie@@html:@@@@html:
@@@@html:
@@I watched what may be the worse movie of all time and I still enjoyed greatly the show.@@html:
@@ -- [2019-08-17] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/new-blog.org][New Blog]]* @@html:
@@@@html:@@#blog@@html:@@ @@html:@@#css@@html:@@ @@html:@@#org-mode@@html:@@ @@html:@@#programming@@html:@@ @@html:@@#web@@html:@@@@html:
@@@@html:
@@Meta article about how I generate this blog.@@html:
@@ \ No newline at end of file +- [2019-09-30] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/rss-gen.org][RSS Generation]]* @@html:
@@@@html:@@#programming@@html:@@ @@html:@@#web@@html:@@@@html:
@@@@html:
@@How I generate RSS feed via command line@@html:
@@ +- [2019-09-23] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/how-i-internet.org][How I Internet]]* @@html:
@@@@html:@@#blog@@html:@@ @@html:@@#minimalism@@html:@@ @@html:@@#self-hosting@@html:@@ @@html:@@#web@@html:@@ @@html:@@#zen@@html:@@@@html:
@@@@html:
@@How I protect myself against attention grabbers and many social media anti-patterns.@@html:
@@ +- [2019-08-18] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/project-el/index.org][Autoload Script by project]]* @@html:
@@@@html:@@#blog@@html:@@ @@html:@@#org-mode@@html:@@ @@html:@@#programming@@html:@@@@html:
@@@@html:
@@A script I use to safely load an eLISP file when entering a new project directory.@@html:
@@ +- [2019-08-17] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/troll-2/index.org][Troll 2]]* @@html:
@@@@html:@@#movie@@html:@@@@html:
@@@@html:
@@I watched what may be the worst movie of all time and I still greatly enjoyed the show.@@html:
@@ +- [2019-08-17] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/new-blog.org][New Blog]]* @@html:
@@@@html:@@#blog@@html:@@ @@html:@@#css@@html:@@ @@html:@@#org-mode@@html:@@ @@html:@@#programming@@html:@@ @@html:@@#web@@html:@@@@html:
@@@@html:
@@Meta article about how I generate this blog.@@html:
@@ \ No newline at end of file diff --git a/src/posts/rss-gen.org b/src/posts/rss-gen.org new file mode 100644 index 0000000..5927aa4 --- /dev/null +++ b/src/posts/rss-gen.org @@ -0,0 +1,181 @@ +#+TITLE: RSS Generation +#+SUBTITLE: How to generate RSS feed via command line +#+AUTHOR: Yann Esposito +#+EMAIL: yann@esposito.host +#+DATE: [2019-09-30 Mon] +#+KEYWORDS: programming, web +#+DESCRIPTION: How I generate RSS feed via command line +#+OPTIONS: auto-id:t + +#+begin_notes +TL;DR: To generate an RSS file you need to provide a lot of metadata. +Those metadata are not part of all HTML files. +So generating RSS from a tree of HTML files is not straightforward. +Here is the script I use. +#+end_notes + +* RSS Problem +:PROPERTIES: +:CUSTOM_ID: rss-problem +:END: + +An RSS feed is meant to declare updates and new articles for a website. +Each RSS entry must therefore have a date, a unique id, a title, maybe +some categories, etc. + +For most blog platforms or even static website generators, this meta information +is clearly put in the sources or in some DB. + +I use =org-mode= for generating my website, and =ox-rss= is quite slow +when generating an RSS with the full content of each item. +Mainly, the way to achieve full content of my articles inside an RSS with +=ox-rss= is by first creating a very big org file containing all the +articles, and then transforming it into RSS. And this is very slow (many minutes). + +So a simpler idea inspired by lb[fn:lb] is to generate the RSS directly +from the generated HTML files. +The only difficulty is to find the metadata inside those HTML files. +Unfortunately there is no real standard for all this metadata. + +As there is no standard place for all this meta information inside +an HTML file, in order to use the HTML as a source you'll need to "parse" the +HTML file. +For that purpose I use =html-xml-utils=. 
+ +Here is the full script I use + +#+begin_src bash +#!/usr/bin/env nix-shell +#!nix-shell -i zsh + +# Directory +webdir="_site" +postsdir="$webdir/posts" +rssfile="$webdir/rss.xml" + +# maximal number of articles to put in the RSS file +maxarticles=10 + +# RSS Metas +rsstitle="her.esy.fun" +rssurl="https://her.esy.fun/rss.xml" +websiteurl="https://her.esy.fun" +rssdescription="her.esy.fun articles, mostly random personal thoughts" +rsslang="en" +rssauthor="yann@esposito.host (Yann Esposito)" +rssimgtitle="yogsototh" +rssimgurl="https://her.esy.fun/img/FlatAvatar.png" + +# HTML Accessors (similar to CSS accessors) +dateaccessor='.article-date' +contentaccessor='#content' +# title and keyword shouldn't be changed +titleaccessor='title' +keywordsaccessor='meta[name=keywords]::attr(content)' + +formatdate() { + # format the date for RSS + local d=$1 + LC_TIME=en_US date --date $d +'%a, %d %b %Y %H:%M:%S %z' +} + +finddate(){ < $1 hxselect -c $dateaccessor } +findtitle(){ < $1 hxselect -c $titleaccessor } +getcontent(){ < $1 hxselect $contentaccessor } +findkeywords(){ < $1 hxselect -c $keywordsaccessor | sed 's/,//g' } +mkcategories(){ + for keyword in $*; do + printf "\\n%s" $keyword + done +} + +autoload -U colors && colors + +tmpdir=$(mktemp -d) +typeset -a dates +dates=( ) +for fic in $postsdir/**/*.html; do + blogfile="$(echo "$fic"|sed 's#^'$postsdir'/##')" + printf "%-30s" $blogfile + xfic="$tmpdir/$fic.xml" + mkdir -p $(dirname $xfic) + hxclean $fic > $xfic + d=$(finddate $xfic) + echo -n " [$d]" + rssdate=$(formatdate $d) + title=$(findtitle $xfic) + keywords=( $(findkeywords $xfic) ) + printf ": %-55s" "$title ($keywords)" + categories=$(mkcategories $keywords) + { printf "\\n" + printf "\\n%s" "$title" + printf "\\n%s" "${websiteurl}/${blogfile}" + printf "\\n%s%s" "$rssdate" + printf "%s" "$categories" + printf "\\n" "$(getcontent "$xfic")" + printf "\\n\\n\\n" + } >> "$tmpdir/${d}-$(basename $fic).rss" + dates=( $d $dates ) + echo " 
[${fg[green]}OK${reset_color}]" +done +echo "Publishing" +for fic in $(ls $tmpdir/*.rss | sort -r | head -n $maxarticles ); do + echo "${fic:t}" + cat $fic >> $tmpdir/rss +done + +rssmaxdate=$(formatdate $(for d in $dates; do echo $d; done | sort -r | head -n 1)) +rssbuilddate=$(formatdate $(date)) +{ +cat < + + ${rsstitle} + + ${websiteurl} + + ${rsslang} + ${rssmaxdate} + $rssbuilddate + mkrss.sh + ${rssauthor} + + ${rssimgurl} + ${rssimgtitle} + ${websiteurl} + +END +cat $tmpdir/rss +cat < + +END +} > "$rssfile" + +rm -rf $tmpdir +echo "RSS Generated" +#+end_src + +The =nix-shell= bang pattern is a neat trick to have all the dependencies I +need when running my script. I could have added zsh too, but my main concern +was =html-xml-utils=. + +Alongside my script I have a =shell.nix= file containing: + +#+begin_src nix +{ pkgs ? import (fetchTarball https://github.com/NixOS/nixpkgs/archive/19.09-beta.tar.gz) {} }: + pkgs.mkShell { + buildInputs = [ pkgs.html-xml-utils ]; + } +#+end_src + +[fn:lb] https://github.com/LukeSmithxyz/lb