diff options
Diffstat (limited to 'lib/esan/working_set.cpp')
-rw-r--r-- | lib/esan/working_set.cpp | 280 |
1 files changed, 0 insertions, 280 deletions
diff --git a/lib/esan/working_set.cpp b/lib/esan/working_set.cpp deleted file mode 100644 index e56902c8f..000000000 --- a/lib/esan/working_set.cpp +++ /dev/null @@ -1,280 +0,0 @@ -//===-- working_set.cpp ---------------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file is a part of EfficiencySanitizer, a family of performance tuners. -// -// This file contains working-set-specific code. -//===----------------------------------------------------------------------===// - -#include "working_set.h" -#include "esan.h" -#include "esan_circular_buffer.h" -#include "esan_flags.h" -#include "esan_shadow.h" -#include "esan_sideline.h" -#include "sanitizer_common/sanitizer_procmaps.h" - -// We shadow every cache line of app memory with one shadow byte. -// - The highest bit of each shadow byte indicates whether the corresponding -// cache line has ever been accessed. -// - The lowest bit of each shadow byte indicates whether the corresponding -// cache line was accessed since the last sample. -// - The other bits are used for working set snapshots at successively -// lower frequencies, each bit to the left from the lowest bit stepping -// down the frequency by 2 to the power of getFlags()->snapshot_step. -// Thus we have something like this: -// Bit 0: Since last sample -// Bit 1: Since last 2^2 samples -// Bit 2: Since last 2^4 samples -// Bit 3: ... -// Bit 7: Ever accessed. -// We live with races in accessing each shadow byte. -typedef unsigned char byte; - -namespace __esan { - -// Our shadow memory assumes that the line size is 64. -static const u32 CacheLineSize = 64; - -// See the shadow byte layout description above. -static const u32 TotalWorkingSetBitIdx = 7; -// We accumulate to the left until we hit this bit. -// We don't need to accumulate to the final bit as it's set on each ref -// by the compiler instrumentation. -static const u32 MaxAccumBitIdx = 6; -static const u32 CurWorkingSetBitIdx = 0; -static const byte ShadowAccessedVal = - (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx); - -static SidelineThread Thread; -// If we use real-time-based timer samples this won't overflow in any realistic -// scenario, but if we switch to some other unit (such as memory accesses) we -// may want to consider a 64-bit int. -static u32 SnapshotNum; - -// We store the wset size for each of 8 different sampling frequencies. -static const u32 NumFreq = 8; // One for each bit of our shadow bytes. -// We cannot use static objects as the global destructor is called -// prior to our finalize routine. -// These are each circular buffers, sized up front. -CircularBuffer<u32> SizePerFreq[NumFreq]; -// We cannot rely on static initializers (they may run too late) but -// we record the size here for clarity: -u32 CircularBufferSizes[NumFreq] = { - // These are each mmap-ed so our minimum is one page. - 32*1024, - 16*1024, - 8*1024, - 4*1024, - 4*1024, - 4*1024, - 4*1024, - 4*1024, -}; - -void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size, - bool IsWrite) { - if (Size == 0) - return; - SIZE_T I = 0; - uptr LineSize = getFlags()->cache_line_size; - // As Addr+Size could overflow at the top of a 32-bit address space, - // we avoid the simpler formula that rounds the start and end. - SIZE_T NumLines = Size / LineSize + - // Add any extra at the start or end adding on an extra line: - (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize; - byte *Shadow = (byte *)appToShadow(Addr); - // Write shadow bytes until we're word-aligned. - while (I < NumLines && (uptr)Shadow % 4 != 0) { - if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) - *Shadow |= ShadowAccessedVal; - ++Shadow; - ++I; - } - // Write whole shadow words at a time. - // Using a word-stride loop improves the runtime of a microbenchmark of - // memset calls by 10%. - u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 | - ShadowAccessedVal << 16 | ShadowAccessedVal << 24; - while (I + 4 <= NumLines) { - if ((*(u32*)Shadow & WordValue) != WordValue) - *(u32*)Shadow |= WordValue; - Shadow += 4; - I += 4; - } - // Write any trailing shadow bytes. - while (I < NumLines) { - if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) - *Shadow |= ShadowAccessedVal; - ++Shadow; - ++I; - } -} - -// This routine will word-align ShadowStart and ShadowEnd prior to scanning. -// It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit -// measures the access during the entire execution and should never be cleared. -static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart, - uptr ShadowEnd) { - u32 WorkingSetSize = 0; - u32 ByteValue = 0x1 << BitIdx; - u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 | - ByteValue << 24; - // Get word aligned start. - ShadowStart = RoundDownTo(ShadowStart, sizeof(u32)); - bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx; - // Do not clear the bit that measures access during the entire execution. - bool Clear = BitIdx < TotalWorkingSetBitIdx; - for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) { - if ((*Ptr & WordValue) != 0) { - byte *BytePtr = (byte *)Ptr; - for (u32 j = 0; j < sizeof(u32); ++j) { - if (BytePtr[j] & ByteValue) { - ++WorkingSetSize; - if (Accum) { - // Accumulate to the lower-frequency bit to the left. - BytePtr[j] |= (ByteValue << 1); - } - } - } - if (Clear) { - // Clear this bit from every shadow byte. - *Ptr &= ~WordValue; - } - } - } - return WorkingSetSize; -} - -// Scan shadow memory to calculate the number of cache lines being accessed, -// i.e., the number of non-zero bits indexed by BitIdx in each shadow byte. -// We also clear the lowest bits (most recent working set snapshot). -// We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit -// measures the access during the entire execution and should never be cleared. -static u32 computeWorkingSizeAndReset(u32 BitIdx) { - u32 WorkingSetSize = 0; - MemoryMappingLayout MemIter(true/*cache*/); - MemoryMappedSegment Segment; - while (MemIter.Next(&Segment)) { - VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", __FUNCTION__, - Segment.start, Segment.end, Segment.protection, - isAppMem(Segment.start), isShadowMem(Segment.start)); - if (isShadowMem(Segment.start) && Segment.IsWritable()) { - VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Segment.start, - Segment.end); - WorkingSetSize += - countAndClearShadowValues(BitIdx, Segment.start, Segment.end); - } - } - return WorkingSetSize; -} - -// This is invoked from a signal handler but in a sideline thread doing nothing -// else so it is a little less fragile than a typical signal handler. -static void takeSample(void *Arg) { - u32 BitIdx = CurWorkingSetBitIdx; - u32 Freq = 1; - ++SnapshotNum; // Simpler to skip 0 whose mod matches everything. - while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) { - u32 NumLines = computeWorkingSizeAndReset(BitIdx); - VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName, - SnapshotNum, BitIdx, Freq, NumLines); - SizePerFreq[BitIdx].push_back(NumLines); - Freq = Freq << getFlags()->snapshot_step; - BitIdx++; - } -} - -unsigned int getSampleCountWorkingSet() -{ - return SnapshotNum; -} - -// Initialization that must be done before any instrumented code is executed. -void initializeShadowWorkingSet() { - CHECK(getFlags()->cache_line_size == CacheLineSize); - registerMemoryFaultHandler(); -} - -void initializeWorkingSet() { - if (getFlags()->record_snapshots) { - for (u32 i = 0; i < NumFreq; ++i) - SizePerFreq[i].initialize(CircularBufferSizes[i]); - Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq); - } -} - -static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) { - if (MilliSec > 600000) { - Unit = "min"; - return MilliSec / 60000; - } else if (MilliSec > 10000) { - Unit = "sec"; - return MilliSec / 1000; - } else { - Unit = "ms"; - return MilliSec; - } -} - -static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) { - // We need a constant to avoid software divide support: - static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize; - static const u32 MegabyteCachelines = KilobyteCachelines << 10; - - if (NumOfCachelines > 10 * MegabyteCachelines) { - Unit = "MB"; - return NumOfCachelines / MegabyteCachelines; - } else if (NumOfCachelines > 10 * KilobyteCachelines) { - Unit = "KB"; - return NumOfCachelines / KilobyteCachelines; - } else { - Unit = "Bytes"; - return NumOfCachelines * CacheLineSize; - } -} - -void reportWorkingSet() { - const char *Unit; - if (getFlags()->record_snapshots) { - u32 Freq = 1; - Report(" Total number of samples: %u\n", SnapshotNum); - for (u32 i = 0; i < NumFreq; ++i) { - u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit); - Report(" Samples array #%d at period %u %s\n", i, Time, Unit); - // FIXME: report whether we wrapped around and thus whether we - // have data on the whole run or just the last N samples. - for (u32 j = 0; j < SizePerFreq[i].size(); ++j) { - u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit); - Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit, - SizePerFreq[i][j]); - } - Freq = Freq << getFlags()->snapshot_step; - } - } - - // Get the working set size for the entire execution. - u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx); - u32 Size = getSizeForPrinting(NumOfCachelines, Unit); - Report(" %s: the total working set size: %u %s (%u cache lines)\n", - SanitizerToolName, Size, Unit, NumOfCachelines); -} - -int finalizeWorkingSet() { - if (getFlags()->record_snapshots) - Thread.joinThread(); - reportWorkingSet(); - if (getFlags()->record_snapshots) { - for (u32 i = 0; i < NumFreq; ++i) - SizePerFreq[i].free(); - } - return 0; -} - -} // namespace __esan |