[PATCH v6] gitweb: redacted e-mail addresses feature.
To
git@vger.kernel.org
Cc
Ævar Arnfjörð Bjarmason
brian m. carlson
Georgios Kontaxis
From
Georgios Kontaxis via GitGitGadget
See Also
Prev
Date
2021-03-28 23:26:03 UTC
From: Georgios Kontaxis <geko1702+commits@99rst.org>

Gitweb extracts content from the Git log and makes it accessible
over HTTP. As a result, e-mail addresses found in commits are
exposed to web crawlers and they may not respect robots.txt.
This can result in unsolicited messages.

Introduce an 'email-privacy' feature which redacts e-mail addresses
from the generated HTML content. Specifically, obscure addresses
retrieved from the author/committer and comment sections of the
Git log. The feature is off by default.

This feature does not prevent someone from downloading the
unredacted commit log, e.g., by cloning the repository, and
extracting information from it. It aims to hinder the
low-effort, bulk collection of e-mail addresses by web crawlers.

Signed-off-by: Georgios Kontaxis <geko1702+commits@99rst.org>
---
    gitweb: redacted e-mail addresses feature.
    
    Changes since v1:
    
     * Turned off the feature by default.
     * Removed duplicate code.
     * Added note about Gitweb consumers receiving redacted logs.
    
    Changes since v2:
    
     * The feature can be set on a per-project basis. ('override' => 1)
    
    Changes since v3:
    
     * Renamed feature to "email-privacy" and improved documentation.
     * Removed UI elements for git-format-patch since it won't be redacted.
     * Simplified calls to the address redaction logic.
     * Mail::Address is now used to reduce false-positive redactions.
    
    Changes since v4:
    
     * Rephrased the commit comment.
     * hide_mailaddrs_if_private is slightly more compact.
    
    Changes since v5:
    
     * A simple <local@domain> filter is used instead of Mail::Address to
       identify addresses.
    
    Signed-off-by: Georgios Kontaxis <geko1702+commits@99rst.org>

Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-910%2Fkontaxis%2Fkontaxis%2Femail_privacy-v6
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-910/kontaxis/kontaxis/email_privacy-v6
Pull-Request: https://github.com/gitgitgadget/git/pull/910

Range-diff vs v5:

 1:  1427231f9db5 ! 1:  245cfed8ad58 gitweb: redacted e-mail addresses feature.
     @@ Documentation/gitweb.conf.txt: default font sizes or lineheights are changed (e.
       	`$highlight_bin` program to be available (see the description of
      
       ## gitweb/gitweb.perl ##
     -@@
     - use File::Basename qw(basename);
     - use Time::HiRes qw(gettimeofday tv_interval);
     - use Digest::MD5 qw(md5_hex);
     -+use Git::LoadCPAN::Mail::Address;
     - 
     - binmode STDOUT, ':utf8';
     - 
      @@ gitweb/gitweb.perl: sub evaluate_uri {
       		'sub' => \&feature_extra_branch_refs,
       		'override' => 0,
     @@ gitweb/gitweb.perl: sub parse_date {
       	return %date;
       }
       
     -+sub is_mailaddr {
     -+	my @addrs = Mail::Address->parse(shift);
     -+	if (!@addrs || !$addrs[0]->host || !$addrs[0]->user) {
     -+		return 0;
     -+	}
     -+	return 1;
     -+}
     -+
      +sub hide_mailaddrs_if_private {
      +	my $line = shift;
      +	return $line unless gitweb_check_feature('email-privacy');
     -+	while ($line =~ m/(<[^>]+>)/g) {
     -+		my $match = $1;
     -+		if (!is_mailaddr($match)) {
     -+			next;
     -+		}
     -+		my $match_offset = pos($line) - length($match);
     -+		pos $line = $match_offset;
     -+
     -+		my $redaction = "<redacted>";
     -+		$line =~ s/\G(<[^>]+>)/$redaction/;
     -+
     -+		pos $line = $match_offset + length($redaction);
     -+	}
     ++	$line =~ s/<[^@>]+@[^>]+>/<redacted>/ig;
      +	return $line;
      +}
      +
     @@ gitweb/gitweb.perl: sub git_commitdiff {
       			$formats_nav .= " | " .
       				$cgi->a({-href => href(action=>"patch", -replay=>1)},
       					"patch");
     -
     - ## t/lib-gitweb.sh ##
     -@@ t/lib-gitweb.sh: gitweb_run () {
     - 	GITWEB_CONFIG=$(pwd)/gitweb_config.perl
     - 	export GITWEB_CONFIG
     - 
     -+	PERL5LIB="$GIT_BUILD_DIR/perl:$GIT_BUILD_DIR/perl/FromCPAN"
     -+	export PERL5LIB
     -+
     - 	# some of git commands write to STDERR on error, but this is not
     - 	# written to web server logs, so we are not interested in that:
     - 	# we are interested only in properly formatted errors/warnings


 Documentation/gitweb.conf.txt | 11 +++++++++++
 gitweb/gitweb.perl            | 34 +++++++++++++++++++++++++++-------
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/Documentation/gitweb.conf.txt b/Documentation/gitweb.conf.txt
index 7963a79ba98b..34b1d6e22435 100644
--- a/Documentation/gitweb.conf.txt
+++ b/Documentation/gitweb.conf.txt
@@ -751,6 +751,17 @@ default font sizes or lineheights are changed (e.g. via adding extra
 CSS stylesheet in `@stylesheets`), it may be appropriate to change
 these values.
 
+email-privacy::
+	Redact e-mail addresses from the generated HTML, etc. content.
+	This obscures e-mail addresses retrieved from the author/committer
+	and comment sections of the Git log.
+	It is meant to hinder web crawlers that harvest and abuse addresses.
+	Such crawlers may not respect robots.txt.
+	Note that users and user tools also see the addresses as redacted.
+	If Gitweb is not the final step in a workflow then subsequent steps
+	may misbehave because of the redacted information they receive.
+	Disabled by default.
+
 highlight::
 	Server-side syntax highlight support in "blob" view.  It requires
 	`$highlight_bin` program to be available (see the description of
diff --git a/gitweb/gitweb.perl b/gitweb/gitweb.perl
index 0959a782eccb..01c6faf88006 100755
--- a/gitweb/gitweb.perl
+++ b/gitweb/gitweb.perl
@@ -569,6 +569,15 @@ sub evaluate_uri {
 		'sub' => \&feature_extra_branch_refs,
 		'override' => 0,
 		'default' => []},
+
+	# Redact e-mail addresses.
+
+	# To enable system wide have in $GITWEB_CONFIG
+	# $feature{'email-privacy'}{'default'} = [1];
+	'email-privacy' => {
+		'sub' => sub { feature_bool('email-privacy', @_) },
+		'override' => 1,
+		'default' => [0]},
 );
 
 sub gitweb_get_feature {
@@ -3449,6 +3458,13 @@ sub parse_date {
 	return %date;
 }
 
+sub hide_mailaddrs_if_private {
+	my $line = shift;
+	return $line unless gitweb_check_feature('email-privacy');
+	$line =~ s/<[^@>]+@[^>]+>/<redacted>/ig;
+	return $line;
+}
+
 sub parse_tag {
 	my $tag_id = shift;
 	my %tag;
@@ -3465,7 +3481,7 @@ sub parse_tag {
 		} elsif ($line =~ m/^tag (.+)$/) {
 			$tag{'name'} = $1;
 		} elsif ($line =~ m/^tagger (.*) ([0-9]+) (.*)$/) {
-			$tag{'author'} = $1;
+			$tag{'author'} = hide_mailaddrs_if_private($1);
 			$tag{'author_epoch'} = $2;
 			$tag{'author_tz'} = $3;
 			if ($tag{'author'} =~ m/^([^<]+) <([^>]*)>/) {
@@ -3513,7 +3529,7 @@ sub parse_commit_text {
 		} elsif ((!defined $withparents) && ($line =~ m/^parent ($oid_regex)$/)) {
 			push @parents, $1;
 		} elsif ($line =~ m/^author (.*) ([0-9]+) (.*)$/) {
-			$co{'author'} = to_utf8($1);
+			$co{'author'} = hide_mailaddrs_if_private(to_utf8($1));
 			$co{'author_epoch'} = $2;
 			$co{'author_tz'} = $3;
 			if ($co{'author'} =~ m/^([^<]+) <([^>]*)>/) {
@@ -3523,7 +3539,7 @@ sub parse_commit_text {
 				$co{'author_name'} = $co{'author'};
 			}
 		} elsif ($line =~ m/^committer (.*) ([0-9]+) (.*)$/) {
-			$co{'committer'} = to_utf8($1);
+			$co{'committer'} = hide_mailaddrs_if_private(to_utf8($1));
 			$co{'committer_epoch'} = $2;
 			$co{'committer_tz'} = $3;
 			if ($co{'committer'} =~ m/^([^<]+) <([^>]*)>/) {
@@ -3568,9 +3584,10 @@ sub parse_commit_text {
 	if (! defined $co{'title'} || $co{'title'} eq "") {
 		$co{'title'} = $co{'title_short'} = '(no commit message)';
 	}
-	# remove added spaces
+	# remove added spaces, redact e-mail addresses if applicable.
 	foreach my $line (@commit_lines) {
 		$line =~ s/^    //;
+		$line = hide_mailaddrs_if_private($line);
 	}
 	$co{'comment'} = \@commit_lines;
 
@@ -7489,7 +7506,8 @@ sub git_log_generic {
 			         -accesskey => "n", -title => "Alt-n"}, "next");
 	}
 	my $patch_max = gitweb_get_feature('patches');
-	if ($patch_max && !defined $file_name) {
+	if ($patch_max && !defined $file_name &&
+		!gitweb_check_feature('email-privacy')) {
 		if ($patch_max < 0 || @commitlist <= $patch_max) {
 			$paging_nav .= " &sdot; " .
 				$cgi->a({-href => href(action=>"patches", -replay=>1)},
@@ -7550,7 +7568,8 @@ sub git_commit {
 			} @$parents ) .
 			')';
 	}
-	if (gitweb_check_feature('patches') && @$parents <= 1) {
+	if (gitweb_check_feature('patches') && @$parents <= 1 &&
+		!gitweb_check_feature('email-privacy')) {
 		$formats_nav .= " | " .
 			$cgi->a({-href => href(action=>"patch", -replay=>1)},
 				"patch");
@@ -7863,7 +7882,8 @@ sub git_commitdiff {
 		$formats_nav =
 			$cgi->a({-href => href(action=>"commitdiff_plain", -replay=>1)},
 			        "raw");
-		if ($patch_max && @{$co{'parents'}} <= 1) {
+		if ($patch_max && @{$co{'parents'}} <= 1 &&
+			!gitweb_check_feature('email-privacy')) {
 			$formats_nav .= " | " .
 				$cgi->a({-href => href(action=>"patch", -replay=>1)},
 					"patch");

base-commit: a5828ae6b52137b913b978e16cd2334482eb4c1f
-- 
gitgitgadget