Re: [PATCH v6] gitweb: redacted e-mail addresses feature.
To
Georgios Kontaxis via GitGitGadget
Cc
git@vger.kernel.org
brian m. carlson
Georgios Kontaxis
From
Ævar Arnfjörð Bjarmason
See Also
Prev Ref 1
Date
2021-04-08 22:43:19 UTC

On Mon, Mar 29 2021, Georgios Kontaxis via GitGitGadget wrote:

> [...]
> +email-privacy::
> +	Redact e-mail addresses from the generated HTML, etc. content.
> +	This obscures e-mail addresses retrieved from the author/committer
> +	and comment sections of the Git log.
> +	It is meant to hinder web crawlers that harvest and abuse addresses.
> +	Such crawlers may not respect robots.txt.
> +	Note that users and user tools also see the addresses as redacted.
> +	If Gitweb is not the final step in a workflow then subsequent steps
> +	may misbehave because of the redacted information they receive.
> +	Disabled by default.
> +
>  highlight::
>  	Server-side syntax highlight support in "blob" view.  It requires
>  	`$highlight_bin` program to be available (see the description of
> diff --git a/gitweb/gitweb.perl b/gitweb/gitweb.perl
> index 0959a782eccb..01c6faf88006 100755
> --- a/gitweb/gitweb.perl
> +++ b/gitweb/gitweb.perl
> @@ -569,6 +569,15 @@ sub evaluate_uri {
>  		'sub' => \&feature_extra_branch_refs,
>  		'override' => 0,
>  		'default' => []},
> +
> +	# Redact e-mail addresses.
> +
> +	# To enable system wide have in $GITWEB_CONFIG
> +	# $feature{'email-privacy'}{'default'} = [1];
> +	'email-privacy' => {
> +		'sub' => sub { feature_bool('email-privacy', @_) },
> +		'override' => 1,
> +		'default' => [0]},
>  );
>  
>  sub gitweb_get_feature {
> @@ -3449,6 +3458,13 @@ sub parse_date {
>  	return %date;
>  }
>  
> +sub hide_mailaddrs_if_private {
> +	my $line = shift;
> +	return $line unless gitweb_check_feature('email-privacy');
> +	$line =~ s/<[^@>]+@[^>]+>/<redacted>/ig;

The /i here is redundant, since there is nothing on the LHS of the
s/// that could case-fold. It doesn't harm anything either; just a
small note since it's new in v6...

> +	return $line;
> +}
> +
>  sub parse_tag {
>  	my $tag_id = shift;
>  	my %tag;
> @@ -3465,7 +3481,7 @@ sub parse_tag {
>  		} elsif ($line =~ m/^tag (.+)$/) {
>  			$tag{'name'} = $1;
>  		} elsif ($line =~ m/^tagger (.*) ([0-9]+) (.*)$/) {
> -			$tag{'author'} = $1;
> +			$tag{'author'} = hide_mailaddrs_if_private($1);
>  			$tag{'author_epoch'} = $2;
>  			$tag{'author_tz'} = $3;
>  			if ($tag{'author'} =~ m/^([^<]+) <([^>]*)>/) {
> @@ -3513,7 +3529,7 @@ sub parse_commit_text {
>  		} elsif ((!defined $withparents) && ($line =~ m/^parent ($oid_regex)$/)) {
>  			push @parents, $1;
>  		} elsif ($line =~ m/^author (.*) ([0-9]+) (.*)$/) {
> -			$co{'author'} = to_utf8($1);
> +			$co{'author'} = hide_mailaddrs_if_private(to_utf8($1));
>  			$co{'author_epoch'} = $2;
>  			$co{'author_tz'} = $3;
>  			if ($co{'author'} =~ m/^([^<]+) <([^>]*)>/) {
> @@ -3523,7 +3539,7 @@ sub parse_commit_text {
>  				$co{'author_name'} = $co{'author'};
>  			}
>  		} elsif ($line =~ m/^committer (.*) ([0-9]+) (.*)$/) {
> -			$co{'committer'} = to_utf8($1);
> +			$co{'committer'} = hide_mailaddrs_if_private(to_utf8($1));
>  			$co{'committer_epoch'} = $2;
>  			$co{'committer_tz'} = $3;
>  			if ($co{'committer'} =~ m/^([^<]+) <([^>]*)>/) {
> @@ -3568,9 +3584,10 @@ sub parse_commit_text {
>  	if (! defined $co{'title'} || $co{'title'} eq "") {
>  		$co{'title'} = $co{'title_short'} = '(no commit message)';
>  	}
> -	# remove added spaces
> +	# remove added spaces, redact e-mail addresses if applicable.
>  	foreach my $line (@commit_lines) {
>  		$line =~ s/^    //;
> +		$line = hide_mailaddrs_if_private($line);
>  	}
>  	$co{'comment'} = \@commit_lines;
>  
> @@ -7489,7 +7506,8 @@ sub git_log_generic {
>  			         -accesskey => "n", -title => "Alt-n"}, "next");
>  	}
>  	my $patch_max = gitweb_get_feature('patches');
> -	if ($patch_max && !defined $file_name) {
> +	if ($patch_max && !defined $file_name &&
> +		!gitweb_check_feature('email-privacy')) {
>  		if ($patch_max < 0 || @commitlist <= $patch_max) {
>  			$paging_nav .= " &sdot; " .
>  				$cgi->a({-href => href(action=>"patches", -replay=>1)},
> @@ -7550,7 +7568,8 @@ sub git_commit {
>  			} @$parents ) .
>  			')';
>  	}
> -	if (gitweb_check_feature('patches') && @$parents <= 1) {
> +	if (gitweb_check_feature('patches') && @$parents <= 1 &&
> +		!gitweb_check_feature('email-privacy')) {
>  		$formats_nav .= " | " .
>  			$cgi->a({-href => href(action=>"patch", -replay=>1)},
>  				"patch");
> @@ -7863,7 +7882,8 @@ sub git_commitdiff {
>  		$formats_nav =
>  			$cgi->a({-href => href(action=>"commitdiff_plain", -replay=>1)},
>  			        "raw");
> -		if ($patch_max && @{$co{'parents'}} <= 1) {
> +		if ($patch_max && @{$co{'parents'}} <= 1 &&
> +			!gitweb_check_feature('email-privacy')) {
>  			$formats_nav .= " | " .
>  				$cgi->a({-href => href(action=>"patch", -replay=>1)},
>  					"patch");

I didn't run this, and hadn't kept up with the last few rounds. I'm happy
to see the pos/while etc. looping gone; this LGTM.