Fix cThread related race conditions

Message ID fc149a8e3f6028824a6ea5beb4ca2dd2e67cb8be.1675370626.git.patrick9876@free.fr
State New
Headers
Series Fix cThread related race conditions |

Commit Message

Patrick Lerda Feb. 2, 2023, 8:56 p.m. UTC
  This change fixes the following issue:

==15457==ERROR: AddressSanitizer: SEGV on unknown address 0x0000000002d0 (pc 0x7fd2f4301710 bp 0x7fd2b5552a30 sp 0x7fd2b5552988 T228)
==15457==The signal is caused by a READ memory access.
==15457==Hint: address points to the zero page.
    #0 0x7fd2f4301710 in pthread_detach (/lib64/libc.so.6+0x8b710)
    #1 0xdb4430 in cThread::Start() vdr-2.6.3/thread.c:317
    #2 0x7fd2e5ea8fc4 in cIptvStreamer::Open() vdr-2.6.3/PLUGINS/src/iptv/streamer.c:66
    #3 0x7fd2e5e1e8c2 in cIptvDevice::OpenDvr() vdr-2.6.3/PLUGINS/src/iptv/device.c:342
    #4 0x634903 in cDevice::Action() vdr-2.6.3/device.c:1714
    #5 0xdb644a in cThread::StartThread(cThread*) vdr-2.6.3/thread.c:293
    #6 0x7fd2f4300729  (/lib64/libc.so.6+0x8a729)
    #7 0x7fd2f437d0bb  (/lib64/libc.so.6+0x1070bb)

AddressSanitizer can not provide additional info.
SUMMARY: AddressSanitizer: SEGV (/lib64/libc.so.6+0x8b710) in pthread_detach
Thread T228 (device 5 receiv) created by T222 (cLiveStreamer s) here:
    #0 0x7fd2f52d4716 in pthread_create (/usr/lib64/libasan.so.6+0x59716)
    #1 0xdb437b in cThread::Start() vdr-2.6.3/thread.c:316
    #2 0x63a3ad in cDevice::AttachReceiver(cReceiver*) vdr-2.6.3/device.c:1844
    #3 0x7fd2e8162c56 in cDummyReceiver::Create(cDevice*) vdr-2.6.3/PLUGINS/src/vnsiserver/videoinput.c:472
    #4 0x7fd2e816dbc2 in cVideoInput::Open(cChannel const*, int, cVideoBuffer*) vdr-2.6.3/PLUGINS/src/vnsiserver/videoinput.c:530
    #5 0x7fd2e80d78d3 in cLiveStreamer::Action() vdr-2.6.3/PLUGINS/src/vnsiserver/streamer.c:318
    #6 0xdb644a in cThread::StartThread(cThread*) vdr-2.6.3/thread.c:293
    #7 0x7fd2f4300729  (/lib64/libc.so.6+0x8a729)
---
 thread.c | 25 +++++++++++++++----------
 thread.h | 11 +++++++----
 2 files changed, 22 insertions(+), 14 deletions(-)
  

Comments

Klaus Schmidinger Feb. 15, 2023, 4:17 p.m. UTC | #1
On 02.02.23 21:56, Patrick Lerda wrote:
> ...
> diff --git a/thread.c b/thread.c
> index 93eb8c0..21be7a4 100644
> --- a/thread.c
> +++ b/thread.c
> @@ -312,13 +312,16 @@ bool cThread::Start(void)
>                 cCondWait::SleepMs(THREAD_STOP_SLEEP);
>           }
>        if (!active) {
> -        active = running = true;
> -        if (pthread_create(&childTid, NULL, (void *(*) (void *))&StartThread, (void *)this) == 0) {
> -           pthread_detach(childTid); // auto-reap
> +        pthread_t localTid;
> +        running = true;
> +        if (pthread_create(&localTid, NULL, (void *(*) (void *))&StartThread, (void *)this) == 0) {
> +           pthread_detach(localTid); // auto-reap
> +           childTid = localTid;
> +           active = true;
>              }
>           else {
>              LOG_ERROR;
> -           active = running = false;
> +           running = false;
>              return false;
>              }
>           }
> @@ -339,11 +342,12 @@ bool cThread::Active(void)
>        // performed but no signal is actually sent.
>        //
>        int err;
> -     if ((err = pthread_kill(childTid, 0)) != 0) {
> +     const pthread_t localTid = childTid;
> +     if ((err = pthread_kill(localTid, 0)) != 0) {
>           if (err != ESRCH)
>              LOG_ERROR;
> -        childTid = 0;
> -        active = running = false;
> +	if (active && childTid == localTid)

localTid was initialized to childTid 4 lines earlier, so what's with the "childTid == localTid" check here? Isn't this always true?

> +	  active = running = false;
>           }
>        else
>           return true;
> @@ -355,6 +359,7 @@ void cThread::Cancel(int WaitSeconds)
>   {
>     running = false;
>     if (active && WaitSeconds > -1) {
> +     const pthread_t localTid = childTid;
>        if (WaitSeconds > 0) {
>           for (time_t t0 = time(NULL) + WaitSeconds; time(NULL) < t0; ) {
>               if (!Active())
> @@ -363,9 +368,9 @@ void cThread::Cancel(int WaitSeconds)
>               }
>           esyslog("ERROR: %s thread %d won't end (waited %d seconds) - canceling it...", description ? description : "", childThreadId, WaitSeconds);
>           }
> -     pthread_cancel(childTid);
> -     childTid = 0;
> -     active = false;
> +     pthread_cancel(localTid);
> +     if (active && childTid == localTid)

Same here?

I see this happens with "address sanitizer". Is there an actual, reproducible, real world problem that this patch fixes?

> +       active = false;
>        }
>   }
>   
> diff --git a/thread.h b/thread.h
> index 16c4bd7..06046ea 100644
> --- a/thread.h
> +++ b/thread.h
> @@ -13,6 +13,7 @@
>   #include <pthread.h>
>   #include <stdio.h>
>   #include <sys/types.h>
> +#include <atomic>
>   
>   typedef pid_t tThreadId;
>   
> @@ -56,7 +57,7 @@ class cRwLock {
>   private:
>     pthread_rwlock_t rwlock;
>     int locked;
> -  tThreadId writeLockThreadId;
> +  std::atomic<tThreadId> writeLockThreadId;
>   public:
>     cRwLock(bool PreferWriter = false);
>     ~cRwLock();
> @@ -79,9 +80,11 @@ public:
>   class cThread {
>     friend class cThreadLock;
>   private:
> -  bool active;
> -  bool running;
> -  pthread_t childTid;
> +  std::atomic_bool active;
> +  std::atomic_bool running;
> +  std::atomic<pthread_t> childTid;
> +       ///< Assume that the content of childTid is valid when the class member
> +       ///< active is set to true and undefined otherwise.
>     tThreadId childThreadId;
>     cMutex mutex;
>     char *description;

Are the "atomics" really necessary?

Klaus
  
Marko Mäkelä Feb. 18, 2023, 10:10 a.m. UTC | #2
Wed, Feb 15, 2023 at 05:17:55PM +0100, Klaus Schmidinger wrote:
>On 02.02.23 21:56, Patrick Lerda wrote:
>>...
>>diff --git a/thread.c b/thread.c
>>index 93eb8c0..21be7a4 100644
>>--- a/thread.c
>>+++ b/thread.c
>>@@ -312,13 +312,16 @@ bool cThread::Start(void)
>>                cCondWait::SleepMs(THREAD_STOP_SLEEP);
>>          }
>>       if (!active) {
>>-        active = running = true;
>>-        if (pthread_create(&childTid, NULL, (void *(*) (void *))&StartThread, (void *)this) == 0) {
>>-           pthread_detach(childTid); // auto-reap
>>+        pthread_t localTid;
>>+        running = true;
>>+        if (pthread_create(&localTid, NULL, (void *(*) (void *))&StartThread, (void *)this) == 0) {
>>+           pthread_detach(localTid); // auto-reap
>>+           childTid = localTid;
>>+           active = true;
[snip]
>>+     if ((err = pthread_kill(localTid, 0)) != 0) {
>>          if (err != ESRCH)
>>             LOG_ERROR;
>>-        childTid = 0;
>>-        active = running = false;
>>+	if (active && childTid == localTid)
>
>localTid was initialized to childTid 4 lines earlier, so what's with 
>the "childTid == localTid" check here? Isn't this always true?

cThread::childTid may be modified by another thread that is executing 
cThread::Action() inside cThread::StartThread().

Thinking aloud: Do we need "bool active", or could "childTid!=0" take 
its role?

>I see this happens with "address sanitizer". Is there an actual, 
>reproducible, real world problem that this patch fixes?

AddressSanitizer only changes some timing characteristics. It should not 
have any direct impact on the possibility of race conditions.

I can agree with your questioning of ThreadSanitizer findings, but I 
think that AddressSanitizer needs to be taken seriously.

For a while, in a multi-threaded piece of software that I maintain, 
AddressSanitizer seemed to issue bogus errors. The reason was a race 
condition around some additional instrumentation to declare memory 
inside a custom allocator as "not allocated", by 
ASAN_POISON_MEMORY_REGION() and ASAN_UNPOISON_MEMORY_REGION(). Ever 
since the code was changed so that we will not shortly poison everything 
and then unpoison the safe bits, but just poison the unsafe bits, 
-fsanitizer=address has only reported real problems.

>Are the "atomics" really necessary?

Before C++11, I would think that multiple threads were out of the scope 
of the standard, in the "implementation defined" territory, which is 
kind-of "not even wrong". Now that C++ since the 2011 version covers 
multi-threaded execution, data races are unambiguously "wrong", that is, 
"undefined behaviour".

The way the patch uses std::atomic may be an overkill. While 
std::memory_order_seq_cst (the default) may make little difference to 
ISAs that implement a strong memory model (SPARC, IA-32, AMD64), it can 
be an unnecessary burden on ARM, POWER, RISC-V RVWMO.

If we are only interested in a data field itself and not other memory 
that it may be "protecting" in some other cache line, 
std::memory_order_relaxed should suffice.

If you are running on a single-core or single-socket IA-32 or AMD64 CPU, 
all this should not make much difference. There could already be 
sufficient "compiler barriers" around the code.

Proper use of memory barriers or atomics might fix some rare hangs or 
crashes on startup or shutdown on multi-core ARM system, such as a 
Raspberry Pi.

Race conditions do not always lead to crashes; they could lead to 
slowness or busy loops as well. Just an example: After I fixed a few 
things in the rpihddevice plugin, I can reliably play back or 
fast-forward recordings to the very end, with no premature interruption 
or excessive delays.

I recommend the following to anyone who is working on multi-threaded 
code, especially lock-free data structures and algorithms:

https://en.cppreference.com/w/cpp/atomic/memory_order
https://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
https://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf

I find it easier to use a mutex or rw-lock to protect shared data 
structures, and to document how the data is being protected. Atomic 
memory access operations usually come into play when there are 
performance bottlenecks that need to be fixed.

	Marko
  
Patrick Lerda Feb. 19, 2023, 5:01 p.m. UTC | #3
On 15/02/2023 17:17, Klaus Schmidinger wrote:
> On 02.02.23 21:56, Patrick Lerda wrote:
>> ...
>> diff --git a/thread.c b/thread.c
>> index 93eb8c0..21be7a4 100644
>> --- a/thread.c
>> +++ b/thread.c
>> @@ -312,13 +312,16 @@ bool cThread::Start(void)
>>                 cCondWait::SleepMs(THREAD_STOP_SLEEP);
>>           }
>>        if (!active) {
>> -        active = running = true;
>> -        if (pthread_create(&childTid, NULL, (void *(*) (void 
>> *))&StartThread, (void *)this) == 0) {
>> -           pthread_detach(childTid); // auto-reap
>> +        pthread_t localTid;
>> +        running = true;
>> +        if (pthread_create(&localTid, NULL, (void *(*) (void 
>> *))&StartThread, (void *)this) == 0) {
>> +           pthread_detach(localTid); // auto-reap
>> +           childTid = localTid;
>> +           active = true;
>>              }
>>           else {
>>              LOG_ERROR;
>> -           active = running = false;
>> +           running = false;
>>              return false;
>>              }
>>           }
>> @@ -339,11 +342,12 @@ bool cThread::Active(void)
>>        // performed but no signal is actually sent.
>>        //
>>        int err;
>> -     if ((err = pthread_kill(childTid, 0)) != 0) {
>> +     const pthread_t localTid = childTid;
>> +     if ((err = pthread_kill(localTid, 0)) != 0) {
>>           if (err != ESRCH)
>>              LOG_ERROR;
>> -        childTid = 0;
>> -        active = running = false;
>> +	if (active && childTid == localTid)
> 
> localTid was initialized to childTid 4 lines earlier, so what's with
> the "childTid == localTid" check here? Isn't this always true?
> 

The function pthread_kill() is not instantaneous and could pause the 
thread, this condition ensures that childTid was not updated in the 
meantime by a second thread.

>> +	  active = running = false;
>>           }
>>        else
>>           return true;
>> @@ -355,6 +359,7 @@ void cThread::Cancel(int WaitSeconds)
>>   {
>>     running = false;
>>     if (active && WaitSeconds > -1) {
>> +     const pthread_t localTid = childTid;
>>        if (WaitSeconds > 0) {
>>           for (time_t t0 = time(NULL) + WaitSeconds; time(NULL) < t0; 
>> ) {
>>               if (!Active())
>> @@ -363,9 +368,9 @@ void cThread::Cancel(int WaitSeconds)
>>               }
>>           esyslog("ERROR: %s thread %d won't end (waited %d seconds) - 
>> canceling it...", description ? description : "", childThreadId, 
>> WaitSeconds);
>>           }
>> -     pthread_cancel(childTid);
>> -     childTid = 0;
>> -     active = false;
>> +     pthread_cancel(localTid);
>> +     if (active && childTid == localTid)
> 
> Same here?
> 
> I see this happens with "address sanitizer". Is there an actual,
> reproducible, real world problem that this patch fixes?
> 

I had a few random crashes related to cThread, so it could happen. 
Anyway the sanitizer seems to increase the likelihood of these events.

>> +       active = false;
>>        }
>>   }
>>   diff --git a/thread.h b/thread.h
>> index 16c4bd7..06046ea 100644
>> --- a/thread.h
>> +++ b/thread.h
>> @@ -13,6 +13,7 @@
>>   #include <pthread.h>
>>   #include <stdio.h>
>>   #include <sys/types.h>
>> +#include <atomic>
>>     typedef pid_t tThreadId;
>>   @@ -56,7 +57,7 @@ class cRwLock {
>>   private:
>>     pthread_rwlock_t rwlock;
>>     int locked;
>> -  tThreadId writeLockThreadId;
>> +  std::atomic<tThreadId> writeLockThreadId;
>>   public:
>>     cRwLock(bool PreferWriter = false);
>>     ~cRwLock();
>> @@ -79,9 +80,11 @@ public:
>>   class cThread {
>>     friend class cThreadLock;
>>   private:
>> -  bool active;
>> -  bool running;
>> -  pthread_t childTid;
>> +  std::atomic_bool active;
>> +  std::atomic_bool running;
>> +  std::atomic<pthread_t> childTid;
>> +       ///< Assume that the content of childTid is valid when the 
>> class member
>> +       ///< active is set to true and undefined otherwise.
>>     tThreadId childThreadId;
>>     cMutex mutex;
>>     char *description;
> 
> Are the "atomics" really necessary?
> 

Atomic is mandatory, the compiler has to generate the proper op codes to 
ensure "atomic" operations at the thread level.

Patrick


> Klaus
> 
> 
> _______________________________________________
> vdr mailing list
> vdr@linuxtv.org
> https://www.linuxtv.org/cgi-bin/mailman/listinfo/vdr
  
Patrick Lerda Feb. 19, 2023, 5:17 p.m. UTC | #4
On 18/02/2023 11:10, Marko Mäkelä wrote:
> Wed, Feb 15, 2023 at 05:17:55PM +0100, Klaus Schmidinger wrote:
>> On 02.02.23 21:56, Patrick Lerda wrote:
>>> ...
>>> diff --git a/thread.c b/thread.c
>>> index 93eb8c0..21be7a4 100644
>>> --- a/thread.c
>>> +++ b/thread.c
>>> @@ -312,13 +312,16 @@ bool cThread::Start(void)
>>>                cCondWait::SleepMs(THREAD_STOP_SLEEP);
>>>          }
>>>       if (!active) {
>>> -        active = running = true;
>>> -        if (pthread_create(&childTid, NULL, (void *(*) (void 
>>> *))&StartThread, (void *)this) == 0) {
>>> -           pthread_detach(childTid); // auto-reap
>>> +        pthread_t localTid;
>>> +        running = true;
>>> +        if (pthread_create(&localTid, NULL, (void *(*) (void 
>>> *))&StartThread, (void *)this) == 0) {
>>> +           pthread_detach(localTid); // auto-reap
>>> +           childTid = localTid;
>>> +           active = true;
> [snip]
>>> +     if ((err = pthread_kill(localTid, 0)) != 0) {
>>>          if (err != ESRCH)
>>>             LOG_ERROR;
>>> -        childTid = 0;
>>> -        active = running = false;
>>> +	if (active && childTid == localTid)
>> 
>> localTid was initialized to childTid 4 lines earlier, so what's with 
>> the "childTid == localTid" check here? Isn't this always true?
> 
> cThread::childTid may be modified by another thread that is executing
> cThread::Action() inside cThread::StartThread().
> 
> Thinking aloud: Do we need "bool active", or could "childTid!=0" take 
> its role?

Using childTid=0 is unreliable, assuming that childTid is valid when 
active=true is better.

> 
>> I see this happens with "address sanitizer". Is there an actual, 
>> reproducible, real world problem that this patch fixes?
> 
> AddressSanitizer only changes some timing characteristics. It should
> not have any direct impact on the possibility of race conditions.
> 
> I can agree with your questioning of ThreadSanitizer findings, but I
> think that AddressSanitizer needs to be taken seriously.
> 
> For a while, in a multi-threaded piece of software that I maintain,
> AddressSanitizer seemed to issue bogus errors. The reason was a race
> condition around some additional instrumentation to declare memory
> inside a custom allocator as "not allocated", by
> ASAN_POISON_MEMORY_REGION() and ASAN_UNPOISON_MEMORY_REGION(). Ever
> since the code was changed so that we will not shortly poison
> everything and then unpoison the safe bits, but just poison the unsafe
> bits, -fsanitizer=address has only reported real problems.
> 
>> Are the "atomics" really necessary?
> 
> Before C++11, I would think that multiple threads were out of the
> scope of the standard, in the "implementation defined" territory,
> which is kind-of "not even wrong". Now that C++ since the 2011 version
> covers multi-threaded execution, data races are unambiguously "wrong",
> that is, "undefined behaviour".
> 
> The way the patch uses std::atomic may be an overkill. While
> std::memory_order_seq_cst (the default) may make little difference to
> ISAs that implement a strong memory model (SPARC, IA-32, AMD64), it
> can be an unnecessary burden on ARM, POWER, RISC-V RVWMO.
> 
> If we are only interested in a data field itself and not other memory
> that it may be "protecting" in some other cache line,
> std::memory_order_relaxed should suffice.
> 
> If you are running on a single-core or single-socket IA-32 or AMD64
> CPU, all this should not make much difference. There could already be
> sufficient "compiler barriers" around the code.
> 
> Proper use of memory barriers or atomics might fix some rare hangs or
> crashes on startup or shutdown on multi-core ARM system, such as a
> Raspberry Pi.
> 
> Race conditions do not always lead to crashes; they could lead to
> slowness or busy loops as well. Just an example: After I fixed a few
> things in the rpihddevice plugin, I can reliably play back or
> fast-forward recordings to the very end, with no premature
> interruption or excessive delays.
> 
> I recommend the following to anyone who is working on multi-threaded
> code, especially lock-free data structures and algorithms:
> 
> https://en.cppreference.com/w/cpp/atomic/memory_order
> https://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
> https://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf
> 
> I find it easier to use a mutex or rw-lock to protect shared data
> structures, and to document how the data is being protected. Atomic
> memory access operations usually come into play when there are
> performance bottlenecks that need to be fixed.
> 
> 	Marko
> 

I agree with you, another implementation could work as well.

Patrick.

> _______________________________________________
> vdr mailing list
> vdr@linuxtv.org
> https://www.linuxtv.org/cgi-bin/mailman/listinfo/vdr
  

Patch

diff --git a/thread.c b/thread.c
index 93eb8c0..21be7a4 100644
--- a/thread.c
+++ b/thread.c
@@ -312,13 +312,16 @@  bool cThread::Start(void)
               cCondWait::SleepMs(THREAD_STOP_SLEEP);
         }
      if (!active) {
-        active = running = true;
-        if (pthread_create(&childTid, NULL, (void *(*) (void *))&StartThread, (void *)this) == 0) {
-           pthread_detach(childTid); // auto-reap
+        pthread_t localTid;
+        running = true;
+        if (pthread_create(&localTid, NULL, (void *(*) (void *))&StartThread, (void *)this) == 0) {
+           pthread_detach(localTid); // auto-reap
+           childTid = localTid;
+           active = true;
            }
         else {
            LOG_ERROR;
-           active = running = false;
+           running = false;
            return false;
            }
         }
@@ -339,11 +342,12 @@  bool cThread::Active(void)
      // performed but no signal is actually sent.
      //
      int err;
-     if ((err = pthread_kill(childTid, 0)) != 0) {
+     const pthread_t localTid = childTid;
+     if ((err = pthread_kill(localTid, 0)) != 0) {
         if (err != ESRCH)
            LOG_ERROR;
-        childTid = 0;
-        active = running = false;
+	if (active && childTid == localTid)
+	  active = running = false;
         }
      else
         return true;
@@ -355,6 +359,7 @@  void cThread::Cancel(int WaitSeconds)
 {
   running = false;
   if (active && WaitSeconds > -1) {
+     const pthread_t localTid = childTid;
      if (WaitSeconds > 0) {
         for (time_t t0 = time(NULL) + WaitSeconds; time(NULL) < t0; ) {
             if (!Active())
@@ -363,9 +368,9 @@  void cThread::Cancel(int WaitSeconds)
             }
         esyslog("ERROR: %s thread %d won't end (waited %d seconds) - canceling it...", description ? description : "", childThreadId, WaitSeconds);
         }
-     pthread_cancel(childTid);
-     childTid = 0;
-     active = false;
+     pthread_cancel(localTid);
+     if (active && childTid == localTid)
+       active = false;
      }
 }
 
diff --git a/thread.h b/thread.h
index 16c4bd7..06046ea 100644
--- a/thread.h
+++ b/thread.h
@@ -13,6 +13,7 @@ 
 #include <pthread.h>
 #include <stdio.h>
 #include <sys/types.h>
+#include <atomic>
 
 typedef pid_t tThreadId;
 
@@ -56,7 +57,7 @@  class cRwLock {
 private:
   pthread_rwlock_t rwlock;
   int locked;
-  tThreadId writeLockThreadId;
+  std::atomic<tThreadId> writeLockThreadId;
 public:
   cRwLock(bool PreferWriter = false);
   ~cRwLock();
@@ -79,9 +80,11 @@  public:
 class cThread {
   friend class cThreadLock;
 private:
-  bool active;
-  bool running;
-  pthread_t childTid;
+  std::atomic_bool active;
+  std::atomic_bool running;
+  std::atomic<pthread_t> childTid;
+       ///< Assume that the content of childTid is valid when the class member
+       ///< active is set to true and undefined otherwise.
   tThreadId childThreadId;
   cMutex mutex;
   char *description;