Skip to content

Commit

Permalink
wsd: move anonymization implementation to own home
Browse files Browse the repository at this point in the history
This is non-functional refactoring. It will
be followed by some improvements.

Change-Id: Icada6372e684e3ca8fcac7d2f8350ec2804d23da
Signed-off-by: Ashod Nakashian <ashod.nakashian@collabora.co.uk>
  • Loading branch information
Ashod committed Nov 20, 2024
1 parent 12d6409 commit fc2c441
Show file tree
Hide file tree
Showing 17 changed files with 197 additions and 158 deletions.
3 changes: 2 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,8 @@ wsd_headers = wsd/Admin.hpp \
wsd/wopi/WopiProxy.hpp \
wsd/wopi/WopiStorage.hpp

shared_headers = common/Common.hpp \
shared_headers = common/Anonymizer.hpp \
common/Common.hpp \
common/CharacterConverter.hpp \
common/Clipboard.hpp \
common/Crypto.hpp \
Expand Down
101 changes: 101 additions & 0 deletions common/Anonymizer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
/*
* Copyright the Collabora Online contributors.
*
* SPDX-License-Identifier: MPL-2.0
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#pragma once

#include <common/Log.hpp>
#include <common/Util.hpp>

#include <atomic>
#include <mutex>
#include <string>
#include <unordered_map>

extern std::unordered_map<std::string, std::string> AnonymizedStrings;
extern std::mutex AnonymizedMutex;

/// Responsible for anonymizing names and URLs.
/// The anonymized version is always the same for
/// a given value, provided the salt is identical.
class Anonymizer
{
public:
    /// Sets the anonymized version of a given plain-text string.
    /// After this, 'anonymize(plain)' will return 'anonymized'.
    /// Does nothing when either argument is empty; an existing
    /// mapping for 'plain' is overwritten.
    static void mapAnonymized(const std::string& plain, const std::string& anonymized)
    {
        if (plain.empty() || anonymized.empty())
            return;

        if (plain != anonymized)
            LOG_TRC("Anonymizing [" << plain << "] -> [" << anonymized << "].");

        std::lock_guard<std::mutex> lock(AnonymizedMutex);

        AnonymizedStrings[plain] = anonymized;
    }

    /// Anonymize a sensitive string to avoid leaking it.
    /// Called on strings to be logged or exposed.
    /// Returns the previously mapped value for 'text' if there is one,
    /// otherwise generates a new "#<counter>#<hash>#" token, records it
    /// via mapAnonymized() and returns it.
    /// Note that an empty 'text' is never recorded (mapAnonymized()
    /// ignores empty strings), so each call with an empty string
    /// yields a token with a new counter prefix.
    static std::string anonymize(const std::string& text, const std::uint64_t nAnonymizationSalt)
    {
        {
            // Scoped so the lock is released before hashing and before
            // mapAnonymized() takes the same (non-recursive) mutex again.
            std::lock_guard<std::mutex> lock(AnonymizedMutex);

            const auto it = AnonymizedStrings.find(text);
            if (it != AnonymizedStrings.end())
            {
                if (text != it->second)
                    LOG_TRC("Found anonymized [" << text << "] -> [" << it->second << "].");
                return it->second;
            }
        }

        // Modified 64-bit FNV-1a to add salting.
        // For the algorithm and the magic numbers, see http://isthe.com/chongo/tech/comp/fnv/
        // The salt is mixed in both before and after the text bytes.
        // NOTE(review): where 'char' is signed, bytes >= 0x80 are sign-extended
        // by the cast below, which differs from canonical FNV-1a. Left as-is so
        // that previously generated hashes remain unchanged.
        std::uint64_t hash = 0xCBF29CE484222325ULL;
        hash ^= nAnonymizationSalt;
        hash *= 0x100000001b3ULL;
        for (const char c : text)
        {
            hash ^= static_cast<std::uint64_t>(c);
            hash *= 0x100000001b3ULL;
        }

        hash ^= nAnonymizationSalt;
        hash *= 0x100000001b3ULL;

        // Generate the anonymized string. The '#' is to hint that it's anonymized.
        // Prepend with count to make it unique within a single process instance,
        // in case we get collisions (which we will, eventually). N.B.: Identical
        // strings likely to have different prefixes when logged in WSD process vs. Kit.
        static std::atomic<unsigned> AnonymizationCounter(0);
        std::string res =
            '#' + Util::encodeId(AnonymizationCounter++, 0) + '#' + Util::encodeId(hash, 0) + '#';
        mapAnonymized(text, res);
        return res;
    }

    /// Clears the shared state of mapAnonymized() / anonymize().
    static void clearAnonymized()
    {
        // The map is shared with mapAnonymized() and anonymize(), which both
        // access it under AnonymizedMutex; clearing it without the lock would
        // race with them.
        std::lock_guard<std::mutex> lock(AnonymizedMutex);

        AnonymizedStrings.clear();
    }

    /// Anonymize the basename of filenames only, preserving the path and extension.
    /// The URL is split with Util::splitUrl() and only the filename component
    /// is passed through anonymize(); the other components are returned verbatim.
    static std::string anonymizeUrl(const std::string& url, const std::uint64_t nAnonymizationSalt)
    {
        std::string base;
        std::string filename;
        std::string ext;
        std::string params;
        std::tie(base, filename, ext, params) = Util::splitUrl(url);

        return base + anonymize(filename, nAnonymizationSalt) + ext + params;
    }
};
13 changes: 7 additions & 6 deletions common/FileUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@

#include "FileUtil.hpp"

#include <common/Anonymizer.hpp>
#include <common/Log.hpp>
#include <common/Unit.hpp>
#include <common/Util.hpp>

#include <dirent.h>
#include <exception>
#include <ftw.h>
Expand Down Expand Up @@ -43,10 +48,6 @@
#include <Poco/File.h>
#include <Poco/Path.h>

#include "Log.hpp"
#include "Util.hpp"
#include "Unit.hpp"

namespace FileUtil
{
std::string createRandomDir(const std::string& path)
Expand Down Expand Up @@ -621,14 +622,14 @@ namespace FileUtil
/// Anonymize the basename of filenames, preserving the path and extension.
std::string anonymizeUrl(const std::string& url)
{
return AnonymizeUserData ? Util::anonymizeUrl(url, AnonymizationSalt) : url;
return AnonymizeUserData ? Anonymizer::anonymizeUrl(url, AnonymizationSalt) : url;
}

/// Anonymize user names and IDs.
/// Will use the Obfuscated User ID if one is provided via WOPI.
std::string anonymizeUsername(const std::string& username)
{
return AnonymizeUserData ? Util::anonymize(username, AnonymizationSalt) : username;
return AnonymizeUserData ? Anonymizer::anonymize(username, AnonymizationSalt) : username;
}

std::string extractFileExtension(const std::string& path)
Expand Down
17 changes: 9 additions & 8 deletions common/Session.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,17 @@

#include "Session.hpp"

#include <common/Anonymizer.hpp>
#include <common/Log.hpp>
#include <common/Protocol.hpp>
#include <common/Uri.hpp>
#include <common/Util.hpp>

#include <Poco/Exception.h>
#include <Poco/Path.h>
#include <Poco/String.h>
#include <Poco/URI.h>

#include <common/Uri.hpp>
#include "Protocol.hpp"
#include "Log.hpp"
#include "Util.hpp"

using namespace COOLProtocol;

using Poco::Exception;
Expand Down Expand Up @@ -240,9 +241,9 @@ void Session::parseDocOptions(const StringVector& tokens, int& part, std::string
}
}

Util::mapAnonymized(_userId, _userIdAnonym);
Util::mapAnonymized(_userName, _userNameAnonym);
Util::mapAnonymized(_jailedFilePath, _jailedFilePathAnonym);
Anonymizer::mapAnonymized(_userId, _userIdAnonym);
Anonymizer::mapAnonymized(_userName, _userNameAnonym);
Anonymizer::mapAnonymized(_jailedFilePath, _jailedFilePathAnonym);

if (tokens.size() > offset)
{
Expand Down
74 changes: 3 additions & 71 deletions common/Util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@
#include "Protocol.hpp"
#include "TraceEvent.hpp"

std::unordered_map<std::string, std::string> AnonymizedStrings;
std::mutex AnonymizedMutex;

namespace Util
{
namespace rng
Expand Down Expand Up @@ -481,77 +484,6 @@ namespace Util
return std::make_tuple(base, filename, ext, params);
}

static std::unordered_map<std::string, std::string> AnonymizedStrings;
static std::atomic<unsigned> AnonymizationCounter(0);
static std::mutex AnonymizedMutex;

/// Registers 'anonymized' as the value anonymize() returns for 'plain'.
/// The call is ignored when either string is empty.
void mapAnonymized(const std::string& plain, const std::string& anonymized)
{
    const bool haveBoth = !plain.empty() && !anonymized.empty();
    if (!haveBoth)
        return;

    // Only worth tracing when the mapping actually changes the text.
    if (Log::traceEnabled() && plain != anonymized)
    {
        LOG_TRC("Anonymizing [" << plain << "] -> [" << anonymized << "].");
    }

    const std::lock_guard<std::mutex> guard(AnonymizedMutex);
    AnonymizedStrings[plain] = anonymized;
}

/// Returns the anonymized form of 'text': the stored mapping when one
/// exists, otherwise a freshly generated "#<counter>#<hash>#" token
/// that is then recorded through mapAnonymized().
std::string anonymize(const std::string& text, const std::uint64_t nAnonymizationSalt)
{
    {
        // Look up an existing mapping; the guard is dropped at the end of
        // this scope, before mapAnonymized() locks the same mutex below.
        const std::lock_guard<std::mutex> guard(AnonymizedMutex);

        const auto known = AnonymizedStrings.find(text);
        if (known != AnonymizedStrings.end())
        {
            const std::string& cached = known->second;
            if (Log::traceEnabled() && text != cached)
            {
                LOG_TRC("Found anonymized [" << text << "] -> [" << cached << "].");
            }
            return cached;
        }
    }

    // Modified 64-bit FNV-1a to add salting.
    // For the algorithm and the magic numbers, see http://isthe.com/chongo/tech/comp/fnv/
    const std::uint64_t fnvPrime = 0x100000001b3ULL;
    std::uint64_t digest = 0xCBF29CE484222325LL;

    // Salt goes in first, then every character, then the salt once more.
    digest = (digest ^ nAnonymizationSalt) * fnvPrime;
    for (const char ch : text)
        digest = (digest ^ static_cast<std::uint64_t>(ch)) * fnvPrime;
    digest = (digest ^ nAnonymizationSalt) * fnvPrime;

    // Generate the anonymized string. The '#' is to hint that it's anonymized.
    // Prepend with count to make it unique within a single process instance,
    // in case we get collisions (which we will, eventually). N.B.: Identical
    // strings likely to have different prefixes when logged in WSD process vs. Kit.
    const auto sequenceId = Util::encodeId(AnonymizationCounter++, 0);
    const auto digestId = Util::encodeId(digest, 0);
    std::string token = '#' + sequenceId + '#' + digestId + '#';

    mapAnonymized(text, token);
    return token;
}

void clearAnonymized()
{
AnonymizedStrings.clear();
}

/// Anonymizes only the filename component of 'url', as split by
/// Util::splitUrl(); the remaining components are kept verbatim.
std::string anonymizeUrl(const std::string& url, const std::uint64_t nAnonymizationSalt)
{
    std::string prefix;
    std::string stem;
    std::string suffix;
    std::string query;
    std::tie(prefix, stem, suffix, query) = Util::splitUrl(url);

    // Reassemble in the original order with the filename replaced.
    std::string result = prefix;
    result += Util::anonymize(stem, nAnonymizationSalt);
    result += suffix;
    result += query;
    return result;
}

std::string getTimeNow(const char* format)
{
char time_now[64];
Expand Down
14 changes: 0 additions & 14 deletions common/Util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1163,20 +1163,6 @@ int main(int argc, char**argv)
/// either for a URL or for a file path
std::string cleanupFilename(const std::string &filename);

/// Anonymize a sensitive string to avoid leaking it.
/// Called on strings to be logged or exposed.
std::string anonymize(const std::string& text, const std::uint64_t nAnonymizationSalt);

/// Sets the anonymized version of a given plain-text string.
/// After this, 'anonymize(plain)' will return 'anonymized'.
void mapAnonymized(const std::string& plain, const std::string& anonymized);

/// Clears the shared state of mapAnonymized() / anonymize().
void clearAnonymized();

/// Anonymize the basename of filenames only, preserving the path and extension.
std::string anonymizeUrl(const std::string& url, const std::uint64_t nAnonymizationSalt);

/// Return true if the subject matches in given set. It uses regex
/// Mainly used to match WOPI hosts patterns
bool matchRegex(const std::set<std::string>& set, const std::string& subject);
Expand Down
14 changes: 6 additions & 8 deletions kit/ChildSession.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,17 @@

#include <config.h>

#include "Kit.hpp"
#include "ChildSession.hpp"
#include "MobileApp.hpp"
#include "COOLWSD.hpp"

#include <common/Anonymizer.hpp>
#include <common/Log.hpp>
#include <common/Unit.hpp>
#include <common/Util.hpp>

#include <climits>
#include <fstream>
#include <memory>
#include <sstream>
#include <regex>

#define LOK_USE_UNSTABLE_API
#include <LibreOfficeKit/LibreOfficeKitEnums.h>
Expand Down Expand Up @@ -49,10 +50,7 @@
#include <common/SpookyV2.h>
#include <common/Uri.hpp>
#include "KitHelper.hpp"
#include <Log.hpp>
#include <Png.hpp>
#include <Util.hpp>
#include <Unit.hpp>
#include <Clipboard.hpp>
#include <string>
#include <CommandControl.hpp>
Expand Down Expand Up @@ -1281,7 +1279,7 @@ bool ChildSession::downloadAs(const StringVector& tokens)
}

// Obfuscate the new name.
Util::mapAnonymized(Uri::getFilenameFromURL(name), _docManager->getObfuscatedFileId());
Anonymizer::mapAnonymized(Uri::getFilenameFromURL(name), _docManager->getObfuscatedFileId());

getTokenString(tokens[3], "format", format);

Expand Down
11 changes: 6 additions & 5 deletions kit/Kit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

#include <config.h>

#include <common/Anonymizer.hpp>

#include <csignal>
#include <dlfcn.h>
#include <limits>
Expand Down Expand Up @@ -1972,16 +1974,15 @@ std::shared_ptr<lok::Document> Document::load(const std::shared_ptr<ChildSession
_isDocPasswordProtected = false;

const char *pURL = uri.c_str();
LOG_DBG("Calling lokit::documentLoad(" << FileUtil::anonymizeUrl(pURL) << ", \"" << options << "\").");
LOG_DBG("Calling lokit::documentLoad(" << anonymizeUrl(pURL) << ", \"" << options << "\")");
const auto start = std::chrono::steady_clock::now();
_loKitDocument.reset(_loKit->documentLoad(pURL, options.c_str()));
#ifdef __ANDROID__
_loKitDocumentForAndroidOnly = _loKitDocument;
#endif
const auto duration = std::chrono::steady_clock::now() - start;
const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(duration);
LOG_DBG("Returned lokit::documentLoad(" << FileUtil::anonymizeUrl(pURL) << ") in "
<< elapsed);
LOG_DBG("Returned lokit::documentLoad(" << anonymizeUrl(pURL) << ") in " << elapsed);
#ifdef IOS
DocumentData::get(_mobileAppDocId).loKitDocument = _loKitDocument.get();
#endif
Expand Down Expand Up @@ -3815,7 +3816,7 @@ TileWireId getCurrentWireId(bool increment)
std::string anonymizeUrl(const std::string& url)
{
#ifndef BUILDING_TESTS
return AnonymizeUserData ? Util::anonymizeUrl(url, AnonymizationSalt) : url;
return AnonymizeUserData ? Anonymizer::anonymizeUrl(url, AnonymizationSalt) : url;
#else
return url;
#endif
Expand Down Expand Up @@ -3967,7 +3968,7 @@ bool globalPreinit(const std::string &loTemplate)
std::string anonymizeUsername(const std::string& username)
{
#ifndef BUILDING_TESTS
return AnonymizeUserData ? Util::anonymize(username, AnonymizationSalt) : username;
return AnonymizeUserData ? Anonymizer::anonymize(username, AnonymizationSalt) : username;
#else
return username;
#endif
Expand Down
Loading

0 comments on commit fc2c441

Please sign in to comment.