1 files changed, 319 insertions, 212 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 55c1ce3da47f..caafc0c8d443 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -43,6 +43,231 @@ std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
     0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
 
+namespace llvm {
+
+// A temporary struct to spill SGPRs.
+// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
+// just v_writelane and v_readlane.
+//
+// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
+// is saved to scratch (or the other way around for loads).
+// For this, a VGPR is required where the needed lanes can be clobbered. The
+// RegScavenger can provide a VGPR where currently active lanes can be
+// clobbered, but we still need to save inactive lanes.
+// The high-level steps are:
+// - Try to scavenge SGPR(s) to save exec
+// - Try to scavenge VGPR
+// - Save needed, all or inactive lanes of a TmpVGPR
+// - Spill/Restore SGPRs using TmpVGPR
+// - Restore TmpVGPR
+//
+// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
+// cannot scavenge temporary SGPRs to save exec, we use the following code:
+// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
+// s_not exec, exec
+// buffer_store_dword TmpVGPR ; save inactive lanes
+// s_not exec, exec
+struct SGPRSpillBuilder {
+  struct PerVGPRData {
+    unsigned PerVGPR;
+    unsigned NumVGPRs;
+    int64_t VGPRLanes;
+  };
+
+  // The SGPR to save
+  Register SuperReg;
+  MachineBasicBlock::iterator MI;
+  ArrayRef<int16_t> SplitParts;
+  unsigned NumSubRegs;
+  bool IsKill;
+  const DebugLoc &DL;
+
+  /* When spilling to stack */
+  // The SGPRs are written into this VGPR, which is then written to scratch
+  // (or vice versa for loads).
+  Register TmpVGPR = AMDGPU::NoRegister;
+  // Temporary spill slot to save TmpVGPR to.
+  int TmpVGPRIndex = 0;
+  // If TmpVGPR is live before the spill or if it is scavenged.
+  bool TmpVGPRLive = false;
+  // Scavenged SGPR to save EXEC.
+  Register SavedExecReg = AMDGPU::NoRegister;
+  // Stack index to write the SGPRs to.
+  int Index;
+  unsigned EltSize = 4;
+
+  RegScavenger &RS;
+  MachineBasicBlock &MBB;
+  MachineFunction &MF;
+  SIMachineFunctionInfo &MFI;
+  const SIInstrInfo &TII;
+  const SIRegisterInfo &TRI;
+  bool IsWave32;
+  Register ExecReg;
+  unsigned MovOpc;
+  unsigned NotOpc;
+
+  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
+                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
+                   RegScavenger &RS)
+      : SuperReg(MI->getOperand(0).getReg()), MI(MI),
+        IsKill(MI->getOperand(0).isKill()), DL(MI->getDebugLoc()), Index(Index),
+        RS(RS), MBB(*MI->getParent()), MF(*MBB.getParent()),
+        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
+        IsWave32(IsWave32) {
+    const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg);
+    SplitParts = TRI.getRegSplitParts(RC, EltSize);
+    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+
+    if (IsWave32) {
+      ExecReg = AMDGPU::EXEC_LO;
+      MovOpc = AMDGPU::S_MOV_B32;
+      NotOpc = AMDGPU::S_NOT_B32;
+    } else {
+      ExecReg = AMDGPU::EXEC;
+      MovOpc = AMDGPU::S_MOV_B64;
+      NotOpc = AMDGPU::S_NOT_B64;
+    }
+
+    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+           SuperReg != AMDGPU::EXEC && "exec should never spill");
+  }
+
+  PerVGPRData getPerVGPRData() {
+    PerVGPRData Data;
+    Data.PerVGPR = IsWave32 ? 32 : 64;
+    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
+    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
+    return Data;
+  }
+
+  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
+  // free.
+  // Writes these instructions if an SGPR can be scavenged:
+  // s_mov_b64 s[6:7], exec   ; Save exec
+  // s_mov_b64 exec, 3        ; Wanted lanemask
+  // buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
+  //
+  // Writes these instructions if no SGPR can be scavenged:
+  // buffer_store_dword v0    ; Only if no free VGPR was found
+  // s_not_b64 exec, exec
+  // buffer_store_dword v0    ; Save inactive lanes
+  //                          ; exec stays inverted, it is flipped back in
+  //                          ; restore.
+  void prepare() {
+    // Scavenged temporary VGPR to use. It must be scavenged once for any number
+    // of spilled subregs.
+    // FIXME: The liveness analysis is limited and does not tell if a register
+    // is in use in lanes that are currently inactive. We can never be sure if
+    // a register as actually in use in another lane, so we need to save all
+    // used lanes of the chosen VGPR.
+    TmpVGPR = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false);
+
+    // Reserve temporary stack slot
+    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
+    if (TmpVGPR) {
+      // Found a register that is dead in the currently active lanes, we only
+      // need to spill inactive lanes.
+      TmpVGPRLive = false;
+    } else {
+      // Pick v0 because it doesn't make a difference.
+      TmpVGPR = AMDGPU::VGPR0;
+      TmpVGPRLive = true;
+    }
+
+    // Try to scavenge SGPRs to save exec
+    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
+    const TargetRegisterClass &RC =
+        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
+    RS.setRegUsed(SuperReg);
+    SavedExecReg = RS.scavengeRegister(&RC, MI, 0, false);
+
+    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
+
+    if (SavedExecReg) {
+      // Set exec to needed lanes
+      BuildMI(MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
+      auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
+      if (!TmpVGPRLive)
+        I.addReg(TmpVGPR, RegState::ImplicitDefine);
+      // Spill needed lanes
+      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
+    } else {
+      // Spill active lanes
+      if (TmpVGPRLive)
+        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
+                                    /*IsKill*/ false);
+      // Spill inactive lanes
+      auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+      if (!TmpVGPRLive)
+        I.addReg(TmpVGPR, RegState::ImplicitDefine);
+      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
+    }
+  }
+
+  // Writes these instructions if an SGPR can be scavenged:
+  // buffer_load_dword v1     ; Write scavenged VGPR to emergency slot
+  // s_waitcnt vmcnt(0)       ; If a free VGPR was found
+  // s_mov_b64 exec, s[6:7]   ; Save exec
+  //
+  // Writes these instructions if no SGPR can be scavenged:
+  // buffer_load_dword v0     ; Restore inactive lanes
+  // s_waitcnt vmcnt(0)       ; If a free VGPR was found
+  // s_not_b64 exec, exec
+  // buffer_load_dword v0     ; Only if no free VGPR was found
+  void restore() {
+    if (SavedExecReg) {
+      // Restore used lanes
+      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
+                                  /*IsKill*/ false);
+      // Restore exec
+      auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg)
+                   .addReg(SavedExecReg, RegState::Kill);
+      // Add an implicit use of the load so it is not dead.
+      // FIXME This inserts an unnecessary waitcnt
+      if (!TmpVGPRLive) {
+        I.addReg(TmpVGPR, RegState::Implicit);
+      }
+    } else {
+      // Restore inactive lanes
+      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
+                                  /*IsKill*/ false);
+      auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+      if (!TmpVGPRLive) {
+        I.addReg(TmpVGPR, RegState::Implicit);
+      }
+      // Restore active lanes
+      if (TmpVGPRLive)
+        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
+    }
+  }
+
+  // Write TmpVGPR to memory or read TmpVGPR from memory.
+  // Either using a single buffer_load/store if exec is set to the needed mask
+  // or using
+  // buffer_load
+  // s_not exec, exec
+  // buffer_load
+  // s_not exec, exec
+  void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
+    if (SavedExecReg) {
+      // Spill needed lanes
+      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
+    } else {
+      // Spill active lanes
+      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
+                                  /*IsKill*/ false);
+      // Spill inactive lanes
+      BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
+      BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+    }
+  }
+};
+
+} // namespace llvm
+
 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
     : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
       SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
@@ -1039,120 +1264,36 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
   }
 }
 
-// Generate a VMEM access which loads or stores the VGPR containing an SGPR
-// spill such that all the lanes set in VGPRLanes are loaded or stored.
-// This generates exec mask manipulation and will use SGPRs available in MI
-// or VGPR lanes in the VGPR to save and restore the exec mask.
-void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
-                                             int Index, int Offset,
-                                             unsigned EltSize, Register VGPR,
-                                             int64_t VGPRLanes,
-                                             RegScavenger *RS,
-                                             bool IsLoad) const {
-  MachineBasicBlock *MBB = MI->getParent();
-  MachineFunction *MF = MBB->getParent();
-  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-
-  Register SuperReg = MI->getOperand(0).getReg();
-  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
-  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
-  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
-  unsigned FirstPart = Offset * 32;
-  unsigned ExecLane = 0;
-
-  bool IsKill = MI->getOperand(0).isKill();
-  const DebugLoc &DL = MI->getDebugLoc();
-
-  // Cannot handle load/store to EXEC
-  assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
-         SuperReg != AMDGPU::EXEC && "exec should never spill");
-
-  // On Wave32 only handle EXEC_LO.
-  // On Wave64 only update EXEC_HI if there is sufficent space for a copy.
-  bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI;
-
-  unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-  Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-  Register SavedExecReg;
-
-  // Backup EXEC
-  if (OnlyExecLo) {
-    SavedExecReg =
-        NumSubRegs == 1
-            ? SuperReg
-            : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]));
-  } else {
-    // If src/dst is an odd size it is possible subreg0 is not aligned.
-    for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
-      SavedExecReg = getMatchingSuperReg(
-          getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0,
-          &AMDGPU::SReg_64_XEXECRegClass);
-      if (SavedExecReg)
-        break;
-    }
-  }
-  assert(SavedExecReg);
-  BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
-
-  // Setup EXEC
-  BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
-
+void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
+                                             int Offset, bool IsLoad,
+                                             bool IsKill) const {
   // Load/store VGPR
-  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+  MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
   assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
 
-  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
-                          ? getBaseRegister()
-                          : getFrameRegister(*MF);
+  Register FrameReg =
+      FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
+          ? getBaseRegister()
+          : getFrameRegister(SB.MF);
 
   Align Alignment = FrameInfo.getObjectAlign(Index);
-  MachinePointerInfo PtrInfo =
-      MachinePointerInfo::getFixedStack(*MF, Index);
-  MachineMemOperand *MMO = MF->getMachineMemOperand(
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
+  MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
       PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
-      EltSize, Alignment);
+      SB.EltSize, Alignment);
 
   if (IsLoad) {
     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                           : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
-    buildSpillLoadStore(MI, Opc,
-          Index,
-          VGPR, false,
-          FrameReg,
-          Offset * EltSize, MMO,
-          RS);
+    buildSpillLoadStore(SB.MI, Opc, Index, SB.TmpVGPR, false, FrameReg,
+                        Offset * SB.EltSize, MMO, &SB.RS);
   } else {
     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                           : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
-    buildSpillLoadStore(MI, Opc, Index, VGPR,
-                        IsKill, FrameReg,
-                        Offset * EltSize, MMO, RS);
+    buildSpillLoadStore(SB.MI, Opc, Index, SB.TmpVGPR, IsKill, FrameReg,
+                        Offset * SB.EltSize, MMO, &SB.RS);
     // This only ever adds one VGPR spill
-    MFI->addToSpilledVGPRs(1);
-  }
-
-  // Restore EXEC
-  BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
-      .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill));
-
-  // Restore clobbered SGPRs
-  if (IsLoad) {
-    // Nothing to do; register will be overwritten
-  } else if (!IsKill) {
-    // Restore SGPRs from appropriate VGPR lanes
-    if (!OnlyExecLo) {
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
-              getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
-          .addReg(VGPR)
-          .addImm(ExecLane + 1);
-    }
-    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
-            NumSubRegs == 1 ? SavedExecReg
-                            : Register(getSubReg(
-                                  SuperReg, SplitParts[FirstPart + ExecLane])))
-        .addReg(VGPR, RegState::Kill)
-        .addImm(ExecLane);
+    SB.MFI.addToSpilledVGPRs(1);
   }
 }
 
@@ -1160,115 +1301,97 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                                int Index,
                                RegScavenger *RS,
                                bool OnlyToVGPR) const {
-  MachineBasicBlock *MBB = MI->getParent();
-  MachineFunction *MF = MBB->getParent();
-  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, *RS);
 
-  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
-    = MFI->getSGPRToVGPRSpills(Index);
+  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
+      SB.MFI.getSGPRToVGPRSpills(Index);
   bool SpillToVGPR = !VGPRSpills.empty();
   if (OnlyToVGPR && !SpillToVGPR)
     return false;
 
-  const SIInstrInfo *TII = ST.getInstrInfo();
-
-  Register SuperReg = MI->getOperand(0).getReg();
-  bool IsKill = MI->getOperand(0).isKill();
-  const DebugLoc &DL = MI->getDebugLoc();
-
-  assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
-                         SuperReg != MFI->getFrameOffsetReg()));
-
-  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
-  assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
-         SuperReg != AMDGPU::EXEC && "exec should never spill");
-
-  unsigned EltSize = 4;
-  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
-
-  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
-  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+  assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
+                         SB.SuperReg != SB.MFI.getFrameOffsetReg()));
 
   if (SpillToVGPR) {
-    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
-      Register SubReg = NumSubRegs == 1
-                            ? SuperReg
-                            : Register(getSubReg(SuperReg, SplitParts[i]));
+    for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
+      Register SubReg =
+          SB.NumSubRegs == 1
+              ? SB.SuperReg
+              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
 
-      bool UseKill = IsKill && i == NumSubRegs - 1;
+      bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1;
 
       // Mark the "old value of vgpr" input undef only if this is the first sgpr
       // spill to this specific vgpr in the first basic block.
-      auto MIB =
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
-              .addReg(SubReg, getKillRegState(UseKill))
-              .addImm(Spill.Lane)
-              .addReg(Spill.VGPR);
+      auto MIB = BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
+                         Spill.VGPR)
+                     .addReg(SubReg, getKillRegState(UseKill))
+                     .addImm(Spill.Lane)
+                     .addReg(Spill.VGPR);
 
-      if (i == 0 && NumSubRegs > 1) {
+      if (i == 0 && SB.NumSubRegs > 1) {
         // We may be spilling a super-register which is only partially defined,
         // and need to ensure later spills think the value is defined.
-        MIB.addReg(SuperReg, RegState::ImplicitDefine);
+        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
       }
 
-      if (NumSubRegs > 1)
-        MIB.addReg(SuperReg, getKillRegState(UseKill) | RegState::Implicit);
+      if (SB.NumSubRegs > 1)
+        MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
 
       // FIXME: Since this spills to another register instead of an actual
       // frame index, we should delete the frame index when all references to
       // it are fixed.
     }
   } else {
-    // Scavenged temporary VGPR to use. It must be scavenged once for any number
-    // of spilled subregs.
-    Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-    RS->setRegUsed(TmpVGPR);
+    SB.prepare();
 
-    // SubReg carries the "Kill" flag when SubReg == SuperReg.
-    unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
+    // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
+    unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
 
-    unsigned PerVGPR = 32;
-    unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
-    int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
+    // Per VGPR helper data
+    auto PVD = SB.getPerVGPRData();
 
-    for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
       unsigned TmpVGPRFlags = RegState::Undef;
 
       // Write sub registers into the VGPR
-      for (unsigned i = Offset * PerVGPR,
-                    e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+      for (unsigned i = Offset * PVD.PerVGPR,
+                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
            i < e; ++i) {
-        Register SubReg = NumSubRegs == 1
-                              ? SuperReg
-                              : Register(getSubReg(SuperReg, SplitParts[i]));
+        Register SubReg =
+            SB.NumSubRegs == 1
+                ? SB.SuperReg
+                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
 
         MachineInstrBuilder WriteLane =
-            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR)
+            BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
+                    SB.TmpVGPR)
                 .addReg(SubReg, SubKillState)
-                .addImm(i % PerVGPR)
-                .addReg(TmpVGPR, TmpVGPRFlags);
+                .addImm(i % PVD.PerVGPR)
+                .addReg(SB.TmpVGPR, TmpVGPRFlags);
         TmpVGPRFlags = 0;
 
         // There could be undef components of a spilled super register.
         // TODO: Can we detect this and skip the spill?
-        if (NumSubRegs > 1) {
-          // The last implicit use of the SuperReg carries the "Kill" flag.
+        if (SB.NumSubRegs > 1) {
+          // The last implicit use of the SB.SuperReg carries the "Kill" flag.
           unsigned SuperKillState = 0;
-          if (i + 1 == NumSubRegs)
-            SuperKillState |= getKillRegState(IsKill);
-          WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
+          if (i + 1 == SB.NumSubRegs)
+            SuperKillState |= getKillRegState(SB.IsKill);
+          WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
         }
       }
 
       // Write out VGPR
-      buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
-                              RS, false);
+      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
     }
+
+    SB.restore();
   }
 
   MI->eraseFromParent();
-  MFI->addToSpilledSGPRs(NumSubRegs);
+  SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
   return true;
 }
 
@@ -1276,75 +1399,59 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                  int Index,
                                  RegScavenger *RS,
                                  bool OnlyToVGPR) const {
-  MachineFunction *MF = MI->getParent()->getParent();
-  MachineBasicBlock *MBB = MI->getParent();
-  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, *RS);
 
-  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
-    = MFI->getSGPRToVGPRSpills(Index);
+  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
+      SB.MFI.getSGPRToVGPRSpills(Index);
   bool SpillToVGPR = !VGPRSpills.empty();
   if (OnlyToVGPR && !SpillToVGPR)
     return false;
 
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const DebugLoc &DL = MI->getDebugLoc();
-
-  Register SuperReg = MI->getOperand(0).getReg();
-
-  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
-  assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
-         SuperReg != AMDGPU::EXEC && "exec should never spill");
-
-  unsigned EltSize = 4;
-
-  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
-
-  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
-  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
-
   if (SpillToVGPR) {
-    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
-      Register SubReg = NumSubRegs == 1
-                            ? SuperReg
-                            : Register(getSubReg(SuperReg, SplitParts[i]));
+    for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
+      Register SubReg =
+          SB.NumSubRegs == 1
+              ? SB.SuperReg
+              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
 
       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
-      auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
-                     .addReg(Spill.VGPR)
-                     .addImm(Spill.Lane);
-      if (NumSubRegs > 1 && i == 0)
-        MIB.addReg(SuperReg, RegState::ImplicitDefine);
+      auto MIB =
+          BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
+              .addReg(Spill.VGPR)
+              .addImm(Spill.Lane);
+      if (SB.NumSubRegs > 1 && i == 0)
+        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
     }
   } else {
-    Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-    RS->setRegUsed(TmpVGPR);
+    SB.prepare();
 
-    unsigned PerVGPR = 32;
-    unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
-    int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
+    // Per VGPR helper data
+    auto PVD = SB.getPerVGPRData();
 
-    for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
       // Load in VGPR data
-      buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
-                              RS, true);
+      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
 
       // Unpack lanes
-      for (unsigned i = Offset * PerVGPR,
-                    e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+      for (unsigned i = Offset * PVD.PerVGPR,
+                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
            i < e; ++i) {
-        Register SubReg = NumSubRegs == 1
-                              ? SuperReg
-                              : Register(getSubReg(SuperReg, SplitParts[i]));
+        Register SubReg =
+            SB.NumSubRegs == 1
+                ? SB.SuperReg
+                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
 
         bool LastSubReg = (i + 1 == e);
-        auto MIB =
-            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
-                .addReg(TmpVGPR, getKillRegState(LastSubReg))
-                .addImm(i);
-        if (NumSubRegs > 1 && i == 0)
-          MIB.addReg(SuperReg, RegState::ImplicitDefine);
+        auto MIB = BuildMI(SB.MBB, MI, SB.DL,
+                           SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
+                       .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
+                       .addImm(i);
+        if (SB.NumSubRegs > 1 && i == 0)
+          MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
       }
     }
+
+    SB.restore();
   }
 
   MI->eraseFromParent();